// SPDX-License-Identifier: GPL-2.0-or-later
/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * Memory thresholds
 * Copyright (C) 2009 Nokia Corporation
 * Author: Kirill A. Shutemov
 *
 * Kernel Memory Controller
 * Copyright (C) 2012 Parallels Inc. and Google Inc.
 * Authors: Glauber Costa and Suleiman Souhlal
 *
 * Native page reclaim
 * Charge lifetime sanitation
 * Lockless page tracking & accounting
 * Unified hierarchy configuration model
 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
 *
 * Per memcg lru locking
 * Copyright (C) 2020 Alibaba, Inc, Alex Shi
 */

#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/pagewalk.h>
#include <linux/sched/mm.h>
#include <linux/shmem_fs.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/vm_event_item.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/poll.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmpressure.h>
#include <linux/memremap.h>
#include <linux/mm_inline.h>
#include <linux/swap_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/lockdep.h>
#include <linux/file.h>
#include <linux/resume_user_mode.h>
#include <linux/psi.h>
#include <linux/seq_buf.h>
#include <linux/sched/isolation.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
#include "slab.h"
#include "swap.h"

#include <linux/uaccess.h>
#include <linux/zswapd.h>

#include <trace/events/vmscan.h>

/* The memory controller's cgroup subsystem descriptor, registered with cgroup core. */
struct cgroup_subsys memory_cgrp_subsys __read_mostly;
EXPORT_SYMBOL(memory_cgrp_subsys);

/* Root of the memcg hierarchy; assigned during subsystem initialization. */
struct mem_cgroup *root_mem_cgroup __read_mostly;

/* Active memory cgroup to use from an interrupt context */
DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg);

/* Socket memory accounting disabled? */
static bool cgroup_memory_nosocket __ro_after_init;

/*
 * Kernel memory accounting disabled?
 * NOTE(review): defaults to true here (kmem accounting off unless enabled)
 * and, unlike its nosocket/nobpf siblings, is not marked __ro_after_init.
 * Presumably an intentional deviation in this tree — confirm against the
 * "cgroup.memory=" boot parameter handling.
 */
static bool cgroup_memory_nokmem = true;

/* BPF memory accounting disabled? */
static bool cgroup_memory_nobpf __ro_after_init;

#ifdef CONFIG_CGROUP_WRITEBACK
/* Waitqueue used by the cgroup-writeback foreign-inode (frn) logic — see its users. */
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
#endif

/* Whether legacy memory+swap accounting is active */
static bool do_memsw_account(void)
{
	/* memory+swap (memsw) accounting only exists on the legacy (v1) hierarchy */
	return !cgroup_subsys_on_dfl(memory_cgrp_subsys);
}

/* Event-count targets used to rate-limit threshold and soft-limit updates. */
#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024

/*
 * Cgroups above their limits are maintained in a RB-Tree, independent of
 * their hierarchy representation
 */

/* Per-node RB-tree of memcgs exceeding their soft limit, ordered by excess. */
struct mem_cgroup_tree_per_node {
	struct rb_root rb_root;
	struct rb_node
	*rb_rightmost;	/* cached largest-excess node; NULL when tree empty */
	spinlock_t lock;
};

/* One soft-limit RB-tree per NUMA node. */
struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

/* for OOM */
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};

/*
 * cgroup_event represents events which userspace wants to receive.
 */
struct mem_cgroup_event {
	/*
	 * memcg which the event belongs to.
	 */
	struct mem_cgroup *memcg;
	/*
	 * eventfd to signal userspace about the event.
	 */
	struct eventfd_ctx *eventfd;
	/*
	 * Each of these is stored in a list by the cgroup.
	 */
	struct list_head list;
	/*
	 * register_event() callback will be used to add a new userspace
	 * waiter for changes related to this event. Use eventfd_signal()
	 * on eventfd to send notification to userspace.
	 */
	int (*register_event)(struct mem_cgroup *memcg,
			      struct eventfd_ctx *eventfd, const char *args);
	/*
	 * unregister_event() callback will be called when userspace closes
	 * the eventfd or on cgroup removal. This callback must be set
	 * if you want to provide notification functionality.
	 */
	void (*unregister_event)(struct mem_cgroup *memcg,
				 struct eventfd_ctx *eventfd);
	/*
	 * All fields below are needed to unregister the event when
	 * userspace closes the eventfd.
	 */
	poll_table pt;
	wait_queue_head_t *wqh;
	wait_queue_entry_t wait;
	struct work_struct remove;
};

static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);

/* Stuffs for move charges at task migration. */
/*
 * Types of charges to be moved.
 */
#define MOVE_ANON	0x1U
#define MOVE_FILE	0x2U
#define MOVE_MASK	(MOVE_ANON | MOVE_FILE)

/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
	spinlock_t	  lock; /* for from, to */
	struct mm_struct  *mm;
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	unsigned long flags;
	unsigned long precharge;
	unsigned long moved_charge;
	unsigned long moved_swap;
	struct task_struct *moving_task;	/* a task moving charges */
	wait_queue_head_t waitq;		/* a waitq for other context */
} mc = {
	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};

/*
 * Maximum loops in mem_cgroup_soft_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2

/* for encoding cft->private value on file */
enum res_type {
	_MEM,
	_MEMSWAP,
	_KMEM,
	_TCP,
};

/* Pack/unpack a (resource type, attribute) pair into cft->private. */
#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)

/*
 * Iteration constructs for visiting all cgroups (under a tree). If
 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 * be used for reference counting.
 */
#define for_each_mem_cgroup_tree(iter, root)		\
	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(root, iter, NULL))

#define for_each_mem_cgroup(iter)			\
	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(NULL, iter, NULL))

/* True if current is an OOM victim, has a fatal signal pending, or is exiting. */
static inline bool task_is_dying(void)
{
	return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
		(current->flags & PF_EXITING);
}

/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
	/* NULL means the root cgroup */
	if (!memcg)
		memcg = root_mem_cgroup;
	return &memcg->vmpressure;
}

struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr)
{
	return container_of(vmpr, struct mem_cgroup, vmpressure);
}

#ifdef CONFIG_MEMCG_KMEM
/* Protects the memcg objcg_list fields (see memcg_reparent_objcgs()). */
static DEFINE_SPINLOCK(objcg_lock);

/* True when kernel-memory accounting is disabled (see cgroup_memory_nokmem). */
bool mem_cgroup_kmem_disabled(void)
{
	return cgroup_memory_nokmem;
}

static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
				      unsigned int nr_pages);

/*
 * percpu_ref release callback for an obj_cgroup: uncharges any leftover
 * whole pages, unlinks the objcg from its memcg's list, and frees it
 * after an RCU grace period.
 */
static void obj_cgroup_release(struct percpu_ref *ref)
{
	struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
	unsigned int nr_bytes;
	unsigned int nr_pages;
	unsigned long flags;

	/*
	 * At this point all allocated objects are freed, and
	 * objcg->nr_charged_bytes can't have an arbitrary byte value.
	 * However, it can be PAGE_SIZE or (x * PAGE_SIZE).
	 *
	 * The following sequence can lead to it:
	 * 1) CPU0: objcg == stock->cached_objcg
	 * 2) CPU1: we do a small allocation (e.g. 92 bytes),
	 *    PAGE_SIZE bytes are charged
	 * 3) CPU1: a process from another memcg is allocating something,
	 *    the stock is flushed,
	 *    objcg->nr_charged_bytes = PAGE_SIZE - 92
	 * 4) CPU0: we do release this object,
	 *    92 bytes are added to stock->nr_bytes
	 * 5) CPU0: stock is flushed,
	 *    92 bytes are added to objcg->nr_charged_bytes
	 *
	 * In the result, nr_charged_bytes == PAGE_SIZE.
	 * This page will be uncharged in obj_cgroup_release().
	 */
	nr_bytes = atomic_read(&objcg->nr_charged_bytes);
	WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
	nr_pages = nr_bytes >> PAGE_SHIFT;

	if (nr_pages)
		obj_cgroup_uncharge_pages(objcg, nr_pages);

	spin_lock_irqsave(&objcg_lock, flags);
	list_del(&objcg->list);
	spin_unlock_irqrestore(&objcg_lock, flags);

	percpu_ref_exit(ref);
	kfree_rcu(objcg, rcu);
}

/* Allocate and initialize an obj_cgroup; returns NULL on allocation failure. */
static struct obj_cgroup *obj_cgroup_alloc(void)
{
	struct obj_cgroup *objcg;
	int ret;

	objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
	if (!objcg)
		return NULL;

	ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0,
			      GFP_KERNEL);
	if (ret) {
		kfree(objcg);
		return NULL;
	}
	INIT_LIST_HEAD(&objcg->list);
	return objcg;
}

/*
 * Detach @memcg's active objcg and move it — together with all objcgs
 * previously reparented onto @memcg — to @parent's list, repointing their
 * memcg backlinks. The active objcg's percpu ref is then killed so the
 * final put triggers obj_cgroup_release().
 */
static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
				  struct mem_cgroup *parent)
{
	struct obj_cgroup *objcg, *iter;

	objcg = rcu_replace_pointer(memcg->objcg, NULL, true);

	spin_lock_irq(&objcg_lock);

	/* 1) Ready to reparent active objcg. */
	list_add(&objcg->list, &memcg->objcg_list);
	/* 2) Reparent active objcg and already reparented objcgs to parent. */
	list_for_each_entry(iter, &memcg->objcg_list, list)
		WRITE_ONCE(iter->memcg, parent);
	/* 3) Move already reparented objcgs to the parent's list */
	list_splice(&memcg->objcg_list, &parent->objcg_list);

	spin_unlock_irq(&objcg_lock);

	percpu_ref_kill(&objcg->refcnt);
}

/*
 * A lot of the calls to the cache allocation functions are expected to be
 * inlined by the compiler.
Since the calls to memcg_slab_pre_alloc_hook() are
 * conditional to this static branch, we'll have to allow modules that do
 * kmem_cache_alloc and such to see this symbol as well
 */
DEFINE_STATIC_KEY_FALSE(memcg_kmem_online_key);
EXPORT_SYMBOL(memcg_kmem_online_key);

DEFINE_STATIC_KEY_FALSE(memcg_bpf_enabled_key);
EXPORT_SYMBOL(memcg_bpf_enabled_key);
#endif

/**
 * mem_cgroup_css_from_folio - css of the memcg associated with a folio
 * @folio: folio of interest
 *
 * If memcg is bound to the default hierarchy, css of the memcg associated
 * with @folio is returned. The returned css remains associated with @folio
 * until it is released.
 *
 * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
 * is returned.
 */
struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio)
{
	struct mem_cgroup *memcg = folio_memcg(folio);

	/* uncharged folios and the legacy (v1) hierarchy map to the root css */
	if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
		memcg = root_mem_cgroup;

	return &memcg->css;
}

/**
 * page_cgroup_ino - return inode number of the memcg a page is charged to
 * @page: the page
 *
 * Look up the closest online ancestor of the memory cgroup @page is charged to
 * and return its inode number or 0 if @page is not charged to any cgroup. It
 * is safe to call this function without holding a reference to @page.
 *
 * Note, this function is inherently racy, because there is nothing to prevent
 * the cgroup inode from getting torn down and potentially reallocated a moment
 * after page_cgroup_ino() returns, so it only should be used by callers that
 * do not care (such as procfs interfaces).
 */
ino_t page_cgroup_ino(struct page *page)
{
	struct mem_cgroup *memcg;
	unsigned long ino = 0;

	rcu_read_lock();
	/* page_folio() is racy here, but the entire function is racy anyway */
	memcg = folio_memcg_check(page_folio(page));

	/* walk up to the closest online ancestor */
	while (memcg && !(memcg->css.flags & CSS_ONLINE))
		memcg = parent_mem_cgroup(memcg);
	if (memcg)
		ino = cgroup_ino(memcg->css.cgroup);
	rcu_read_unlock();
	return ino;
}

/*
 * Insert @mz into @mctz's RB-tree, keyed by its new soft-limit excess.
 * No-op if already on the tree or if the excess is zero. Also maintains
 * the cached rightmost (largest-excess) node. Caller holds mctz->lock.
 */
static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
					 struct mem_cgroup_tree_per_node *mctz,
					 unsigned long new_usage_in_excess)
{
	struct rb_node **p = &mctz->rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct mem_cgroup_per_node *mz_node;
	bool rightmost = true;

	if (mz->on_tree)
		return;

	mz->usage_in_excess = new_usage_in_excess;
	if (!mz->usage_in_excess)
		return;
	while (*p) {
		parent = *p;
		mz_node = rb_entry(parent, struct mem_cgroup_per_node,
					tree_node);
		if (mz->usage_in_excess < mz_node->usage_in_excess) {
			p = &(*p)->rb_left;
			/* took a left turn, so we cannot be the new rightmost */
			rightmost = false;
		} else {
			p = &(*p)->rb_right;
		}
	}

	if (rightmost)
		mctz->rb_rightmost = &mz->tree_node;

	rb_link_node(&mz->tree_node, parent, p);
	rb_insert_color(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = true;
}

/*
 * Unlink @mz from @mctz's RB-tree, updating the cached rightmost node.
 * Caller holds mctz->lock.
 */
static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
					 struct mem_cgroup_tree_per_node *mctz)
{
	if (!mz->on_tree)
		return;

	if (&mz->tree_node == mctz->rb_rightmost)
		mctz->rb_rightmost = rb_prev(&mz->tree_node);

	rb_erase(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = false;
}

/* Locked wrapper around __mem_cgroup_remove_exceeded(). */
static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
				       struct mem_cgroup_tree_per_node *mctz)
{
	unsigned long flags;

	spin_lock_irqsave(&mctz->lock, flags);
	__mem_cgroup_remove_exceeded(mz, mctz);
	spin_unlock_irqrestore(&mctz->lock, flags);
}

/*
 * Pages @memcg uses in excess of its soft limit; 0 when at or below it.
 * With CONFIG_HYPERHOLD_FILE_LRU usage is measured as the anon LRU size
 * of node 0 only (active + inactive anon); otherwise it is the memory
 * page counter.
 */
static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
{
#ifdef CONFIG_HYPERHOLD_FILE_LRU
	struct mem_cgroup_per_node *mz =
mem_cgroup_nodeinfo(memcg, 0);
	struct lruvec *lruvec = &mz->lruvec;
	unsigned long nr_pages = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON,
			MAX_NR_ZONES) + lruvec_lru_size(lruvec, LRU_INACTIVE_ANON,
			MAX_NR_ZONES);
#else
	unsigned long nr_pages = page_counter_read(&memcg->memory);
#endif
	unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
	unsigned long excess = 0;

	if (nr_pages > soft_limit)
		excess = nr_pages - soft_limit;

	return excess;
}

/*
 * (Re)position @memcg and all its ancestors on @nid's soft-limit RB-tree
 * after a charge event. When MGLRU is enabled the RB-tree is not used;
 * lru_gen soft reclaim is signalled instead.
 */
static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid)
{
	unsigned long excess;
	struct mem_cgroup_per_node *mz;
	struct mem_cgroup_tree_per_node *mctz;

	if (lru_gen_enabled()) {
		if (soft_limit_excess(memcg))
			lru_gen_soft_reclaim(memcg, nid);
		return;
	}

	mctz = soft_limit_tree.rb_tree_per_node[nid];
	if (!mctz)
		return;
	/*
	 * Necessary to update all ancestors when hierarchy is used,
	 * because their event counter is not touched.
	 */
	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
		mz = memcg->nodeinfo[nid];
		excess = soft_limit_excess(memcg);
		/*
		 * We have to update the tree if mz is on RB-tree or
		 * mem is over its softlimit.
		 */
		if (excess || mz->on_tree) {
			unsigned long flags;

			spin_lock_irqsave(&mctz->lock, flags);
			/* if on-tree, remove it */
			if (mz->on_tree)
				__mem_cgroup_remove_exceeded(mz, mctz);
			/*
			 * Insert again. mz->usage_in_excess will be updated.
			 * If excess is 0, no tree ops.
			 */
			__mem_cgroup_insert_exceeded(mz, mctz, excess);
			spin_unlock_irqrestore(&mctz->lock, flags);
		}
	}
}

/* Remove @memcg's per-node entries from every node's soft-limit tree. */
static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
{
	struct mem_cgroup_tree_per_node *mctz;
	struct mem_cgroup_per_node *mz;
	int nid;

	for_each_node(nid) {
		mz = memcg->nodeinfo[nid];
		mctz = soft_limit_tree.rb_tree_per_node[nid];
		if (mctz)
			mem_cgroup_remove_exceeded(mz, mctz);
	}
}

static struct mem_cgroup_per_node *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *mz;

retry:
	mz = NULL;
	if (!mctz->rb_rightmost)
		goto done;		/* Nothing to reclaim from */

	/* the rightmost node has the largest soft-limit excess */
	mz = rb_entry(mctz->rb_rightmost,
		      struct mem_cgroup_per_node, tree_node);
	/*
	 * Remove the node now but someone else can add it back,
	 * we will add it back at the end of reclaim to its correct
	 * position in the tree.
	 */
	__mem_cgroup_remove_exceeded(mz, mctz);
	/* skip candidates with no excess or whose css ref cannot be taken */
	if (!soft_limit_excess(mz->memcg) ||
	    !css_tryget(&mz->memcg->css))
		goto retry;
done:
	return mz;
}

/*
 * Locked wrapper: pick the memcg per-node entry with the largest soft-limit
 * excess on @mctz, taking a css reference on success. Returns NULL when the
 * tree is empty.
 */
static struct mem_cgroup_per_node *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *mz;

	spin_lock_irq(&mctz->lock);
	mz = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock_irq(&mctz->lock);
	return mz;
}

/*
 * memcg and lruvec stats flushing
 *
 * Many codepaths leading to stats update or read are performance sensitive and
58662306a36Sopenharmony_ci * adding stats flushing in such codepaths is not desirable. So, to optimize the 58762306a36Sopenharmony_ci * flushing the kernel does: 58862306a36Sopenharmony_ci * 58962306a36Sopenharmony_ci * 1) Periodically and asynchronously flush the stats every 2 seconds to not let 59062306a36Sopenharmony_ci * rstat update tree grow unbounded. 59162306a36Sopenharmony_ci * 59262306a36Sopenharmony_ci * 2) Flush the stats synchronously on reader side only when there are more than 59362306a36Sopenharmony_ci * (MEMCG_CHARGE_BATCH * nr_cpus) update events. Though this optimization 59462306a36Sopenharmony_ci * will let stats be out of sync by atmost (MEMCG_CHARGE_BATCH * nr_cpus) but 59562306a36Sopenharmony_ci * only for 2 seconds due to (1). 59662306a36Sopenharmony_ci */ 59762306a36Sopenharmony_cistatic void flush_memcg_stats_dwork(struct work_struct *w); 59862306a36Sopenharmony_cistatic DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork); 59962306a36Sopenharmony_cistatic DEFINE_PER_CPU(unsigned int, stats_updates); 60062306a36Sopenharmony_cistatic atomic_t stats_flush_ongoing = ATOMIC_INIT(0); 60162306a36Sopenharmony_cistatic atomic_t stats_flush_threshold = ATOMIC_INIT(0); 60262306a36Sopenharmony_cistatic u64 flush_next_time; 60362306a36Sopenharmony_ci 60462306a36Sopenharmony_ci#define FLUSH_TIME (2UL*HZ) 60562306a36Sopenharmony_ci 60662306a36Sopenharmony_ci/* 60762306a36Sopenharmony_ci * Accessors to ensure that preemption is disabled on PREEMPT_RT because it can 60862306a36Sopenharmony_ci * not rely on this as part of an acquired spinlock_t lock. These functions are 60962306a36Sopenharmony_ci * never used in hardirq context on PREEMPT_RT and therefore disabling preemtion 61062306a36Sopenharmony_ci * is sufficient. 
 */
static void memcg_stats_lock(void)
{
	preempt_disable_nested();
	VM_WARN_ON_IRQS_ENABLED();
}

/* As memcg_stats_lock(), but without the IRQs-disabled assertion. */
static void __memcg_stats_lock(void)
{
	preempt_disable_nested();
}

static void memcg_stats_unlock(void)
{
	preempt_enable_nested();
}

/*
 * Record that a stat update of magnitude @val happened on this CPU for
 * @memcg: mark the cgroup in the rstat update tree and accumulate the
 * update volume so readers can tell when a synchronous flush pays off.
 */
static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
{
	unsigned int x;

	if (!val)
		return;

	cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());

	x = __this_cpu_add_return(stats_updates, abs(val));
	if (x > MEMCG_CHARGE_BATCH) {
		/*
		 * If stats_flush_threshold exceeds the threshold
		 * (>num_online_cpus()), cgroup stats update will be triggered
		 * in __mem_cgroup_flush_stats(). Increasing this var further
		 * is redundant and simply adds overhead in atomic update.
		 */
		if (atomic_read(&stats_flush_threshold) <= num_online_cpus())
			atomic_add(x / MEMCG_CHARGE_BATCH, &stats_flush_threshold);
		/* Batch consumed; restart the per-cpu accumulation. */
		__this_cpu_write(stats_updates, 0);
	}
}

/* Flush the entire memcg rstat tree and reset the flush bookkeeping. */
static void do_flush_stats(void)
{
	/*
	 * We always flush the entire tree, so concurrent flushers can just
	 * skip. This avoids a thundering herd problem on the rstat global lock
	 * from memcg flushers (e.g. reclaim, refault, etc).
	 */
	if (atomic_read(&stats_flush_ongoing) ||
	    atomic_xchg(&stats_flush_ongoing, 1))
		return;

	WRITE_ONCE(flush_next_time, jiffies_64 + 2*FLUSH_TIME);

	cgroup_rstat_flush(root_mem_cgroup->css.cgroup);

	atomic_set(&stats_flush_threshold, 0);
	atomic_set(&stats_flush_ongoing, 0);
}

/* Flush only once enough updates accumulated to make it worthwhile. */
void mem_cgroup_flush_stats(void)
{
	if (atomic_read(&stats_flush_threshold) > num_online_cpus())
		do_flush_stats();
}

/* As mem_cgroup_flush_stats(), but skipped until flush_next_time passes. */
void mem_cgroup_flush_stats_ratelimited(void)
{
	if (time_after64(jiffies_64, READ_ONCE(flush_next_time)))
		mem_cgroup_flush_stats();
}

static void
flush_memcg_stats_dwork(struct work_struct *w)
{
	/*
	 * Always flush here so that flushing in latency-sensitive paths is
	 * as cheap as possible.
	 */
	do_flush_stats();
	/* Self-rearming: periodic flush every FLUSH_TIME. */
	queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
}

/* Subset of vm_event_item to report for memcg event stats */
static const unsigned int memcg_vm_event_stat[] = {
	PGPGIN,
	PGPGOUT,
	PGSCAN_KSWAPD,
	PGSCAN_DIRECT,
	PGSCAN_KHUGEPAGED,
	PGSTEAL_KSWAPD,
	PGSTEAL_DIRECT,
	PGSTEAL_KHUGEPAGED,
	PGFAULT,
	PGMAJFAULT,
	PGREFILL,
	PGACTIVATE,
	PGDEACTIVATE,
	PGLAZYFREE,
	PGLAZYFREED,
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
	ZSWPIN,
	ZSWPOUT,
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	THP_FAULT_ALLOC,
	THP_COLLAPSE_ALLOC,
#endif
};

#define NR_MEMCG_EVENTS ARRAY_SIZE(memcg_vm_event_stat)
/*
 * Maps a vm_event_item to 1 + its slot in memcg_vm_event_stat; 0 means
 * the event is not tracked per-memcg (see memcg_events_index()).
 */
static int mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly;

/* Build the vm_event_item -> compact memcg event slot lookup table. */
static void init_memcg_events(void)
{
	int i;

	/* Store index + 1 so that an untracked event reads back as 0. */
	for (i = 0; i < NR_MEMCG_EVENTS; ++i)
		mem_cgroup_events_index[memcg_vm_event_stat[i]] = i + 1;
}

/* Compact event slot for @idx, or -1 if the event is not tracked per-memcg. */
static inline int memcg_events_index(enum vm_event_item idx)
{
	return mem_cgroup_events_index[idx] - 1;
}

struct memcg_vmstats_percpu {
	/* Local (CPU and cgroup) page state & events */
	long state[MEMCG_NR_STAT];
	unsigned long events[NR_MEMCG_EVENTS];

	/* Delta calculation for lockless upward propagation */
	long state_prev[MEMCG_NR_STAT];
	unsigned long events_prev[NR_MEMCG_EVENTS];

	/* Cgroup1: threshold notifications & softlimit tree updates */
	unsigned long nr_page_events;
	unsigned long targets[MEM_CGROUP_NTARGETS];
};

struct memcg_vmstats {
	/* Aggregated (CPU and subtree) page state & events */
	long state[MEMCG_NR_STAT];
	unsigned long events[NR_MEMCG_EVENTS];

	/* Non-hierarchical (CPU aggregated) page state & events */
	long state_local[MEMCG_NR_STAT];
	unsigned long events_local[NR_MEMCG_EVENTS];

	/* Pending child counts during tree propagation */
	long state_pending[MEMCG_NR_STAT];
	unsigned long
 events_pending[NR_MEMCG_EVENTS];
};

/*
 * Read the aggregated (CPU and subtree) value of stat item @idx. On SMP,
 * transient negative sums from per-cpu skew are clamped to 0.
 */
unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
{
	long x = READ_ONCE(memcg->vmstats->state[idx]);
#ifdef CONFIG_SMP
	if (x < 0)
		x = 0;
#endif
	return x;
}

/**
 * __mod_memcg_state - update cgroup memory statistics
 * @memcg: the memory cgroup
 * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
 * @val: delta to add to the counter, can be negative
 */
void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
{
	if (mem_cgroup_disabled())
		return;

	/* Per-cpu update first, then notify the rstat machinery. */
	__this_cpu_add(memcg->vmstats_percpu->state[idx], val);
	memcg_rstat_updated(memcg, val);
}

/* idx can be of type enum memcg_stat_item or node_stat_item.
 */
static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
{
	/* state_local is the non-hierarchical (CPU-aggregated) counter. */
	long x = READ_ONCE(memcg->vmstats->state_local[idx]);

#ifdef CONFIG_SMP
	if (x < 0)
		x = 0;
#endif
	return x;
}

/*
 * Update one lruvec (per-node, per-memcg) stat item and mirror the delta
 * into the owning memcg's per-cpu counters.
 */
void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
			      int val)
{
	struct mem_cgroup_per_node *pn;
	struct mem_cgroup *memcg;

	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
	memcg = pn->memcg;

	/*
	 * The callers from rmap rely on disabled preemption because they never
	 * update their counter from in-interrupt context. For these
	 * counters we check that the update is never performed from an
	 * interrupt context while other callers need to have disabled interrupts.
	 */
	__memcg_stats_lock();
	if (IS_ENABLED(CONFIG_DEBUG_VM)) {
		switch (idx) {
		case NR_ANON_MAPPED:
		case NR_FILE_MAPPED:
		case NR_ANON_THPS:
		case NR_SHMEM_PMDMAPPED:
		case NR_FILE_PMDMAPPED:
			/* rmap counters: must be updated from task context. */
			WARN_ON_ONCE(!in_task());
			break;
		default:
			/* All other counters require interrupts disabled. */
			VM_WARN_ON_IRQS_ENABLED();
		}
	}

	/* Update memcg */
	__this_cpu_add(memcg->vmstats_percpu->state[idx], val);

	/* Update lruvec */
	__this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);

	memcg_rstat_updated(memcg, val);
	memcg_stats_unlock();
}

/**
 * __mod_lruvec_state - update lruvec memory statistics
 * @lruvec: the lruvec
 * @idx: the stat item
 * @val: delta to add to the counter, can be negative
 *
 * The lruvec is the intersection of the NUMA node and a cgroup. This
 * function updates all three counters that are affected by a
 * change of state at this level: per-node, per-cgroup, per-lruvec.
 */
void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
			int val)
{
	/* Update node */
	__mod_node_page_state(lruvec_pgdat(lruvec), idx, val);

	/* Update memcg and lruvec */
	if (!mem_cgroup_disabled()) {
#ifdef CONFIG_HYPERHOLD_FILE_LRU
		/* Node-level lruvecs have no memcg counters to update. */
		if (is_node_lruvec(lruvec))
			return;
#endif
		__mod_memcg_lruvec_state(lruvec, idx, val);
	}
}

/*
 * Page-based wrapper for __mod_lruvec_state(): resolves the page's node,
 * memcg and lruvec, then applies the delta at each level.
 */
void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
			     int val)
{
	struct page *head = compound_head(page);	/* rmap on tail pages */
	struct mem_cgroup *memcg;
	pg_data_t *pgdat = page_pgdat(page);
	struct lruvec *lruvec;

#ifdef CONFIG_HYPERHOLD_FILE_LRU
	/*
	 * Hyperhold accounts these file pages at the node level only.
	 * NOTE(review): assumes is_prot_page() marks the file pages that
	 * stay under memcg accounting — confirm against its definition.
	 */
	if (is_file_page(page) && !is_prot_page(page)) {
		__mod_node_page_state(pgdat, idx, val);
		return;
	}
#endif

	rcu_read_lock();
	memcg = page_memcg(head);
	/* Untracked pages have no memcg, no lruvec.
Update only the node */
	if (!memcg) {
		rcu_read_unlock();
		__mod_node_page_state(pgdat, idx, val);
		return;
	}

	lruvec = mem_cgroup_lruvec(memcg, pgdat);
	__mod_lruvec_state(lruvec, idx, val);
	rcu_read_unlock();
}
EXPORT_SYMBOL(__mod_lruvec_page_state);

/*
 * Update lruvec stats for a slab-allocated object; @p is a kernel
 * virtual address, resolved to its memcg via mem_cgroup_from_slab_obj().
 */
void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
{
	pg_data_t *pgdat = page_pgdat(virt_to_page(p));
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;

	rcu_read_lock();
	memcg = mem_cgroup_from_slab_obj(p);

	/*
	 * Untracked pages have no memcg, no lruvec. Update only the
	 * node. If we reparent the slab objects to the root memcg,
	 * when we free the slab object, we need to update the per-memcg
	 * vmstats to keep it correct for the root memcg.
	 */
	if (!memcg) {
		__mod_node_page_state(pgdat, idx, val);
	} else {
		lruvec = mem_cgroup_lruvec(memcg, pgdat);
		__mod_lruvec_state(lruvec, idx, val);
	}
	rcu_read_unlock();
}

/**
 * __count_memcg_events - account VM events in a cgroup
 * @memcg: the memory cgroup
 * @idx: the event item
 * @count: the number of events that occurred
 */
void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
			  unsigned long count)
{
	int index = memcg_events_index(idx);

	/* index < 0: event not in the per-memcg subset, nothing to do. */
	if (mem_cgroup_disabled() || index < 0)
		return;
#ifdef CONFIG_HYPERHOLD_FILE_LRU
	/* Hyperhold may account events against a NULL memcg; ignore them. */
	if (!memcg)
		return;
#endif

	memcg_stats_lock();
	__this_cpu_add(memcg->vmstats_percpu->events[index], count);
	memcg_rstat_updated(memcg, count);
	memcg_stats_unlock();
}

/* Aggregated (CPU and subtree) event count, or 0 for untracked events. */
static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
{
	int index = memcg_events_index(event);

	if (index < 0)
		return 0;
	return READ_ONCE(memcg->vmstats->events[index]);
}

/* Non-hierarchical event count, or 0 for untracked events. */
static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
{
	int index = memcg_events_index(event);

	if (index < 0)
		return 0;

	return READ_ONCE(memcg->vmstats->events_local[index]);
}

/*
 * Account a charge (@nr_pages > 0) or uncharge (@nr_pages < 0) as a
 * PGPGIN/PGPGOUT event and feed nr_page_events, which drives the
 * cgroup1 threshold/softlimit checks.
 */
static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
					 int nr_pages)
{
	/* pagein of a big page is an event. So, ignore page size */
	if (nr_pages > 0)
		__count_memcg_events(memcg, PGPGIN, 1);
	else {
		__count_memcg_events(memcg, PGPGOUT, 1);
		nr_pages = -nr_pages; /* for event */
	}

	__this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
}

/*
 * Returns true when nr_page_events has passed the next checkpoint for
 * @target, and advances that checkpoint.
 */
static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
				       enum mem_cgroup_events_target target)
{
	unsigned long val, next;

	val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
	next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
	/* from time_after() in jiffies.h */
	if ((long)(next - val) < 0) {
		switch (target) {
		case MEM_CGROUP_TARGET_THRESH:
			next = val + THRESHOLDS_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_SOFTLIMIT:
			next = val + SOFTLIMIT_EVENTS_TARGET;
			break;
		default:
			break;
		}
		__this_cpu_write(memcg->vmstats_percpu->targets[target], next);
		return true;
	}
	return false;
}

/*
 * Check events in order.
 *
 */
static void memcg_check_events(struct mem_cgroup *memcg, int nid)
{
	if (IS_ENABLED(CONFIG_PREEMPT_RT))
		return;

	/* threshold event is triggered in finer grain than soft limit */
	if (unlikely(mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_THRESH))) {
		bool do_softlimit;

		do_softlimit = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_SOFTLIMIT);
		mem_cgroup_threshold(memcg);
		if (unlikely(do_softlimit))
			mem_cgroup_update_tree(memcg, nid);
	}
}

/* Look up @p's memcg from its css; no reference is taken. May return NULL. */
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
	if (unlikely(!p))
		return NULL;

	return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
}
EXPORT_SYMBOL(mem_cgroup_from_task);

/*
 * Returns the memcg set up for remote charging, if any: the per-cpu
 * int_active_memcg when not running in task context, otherwise
 * current->active_memcg.
 */
static __always_inline struct mem_cgroup *active_memcg(void)
{
	if (!in_task())
		return this_cpu_read(int_active_memcg);
	else
		return current->active_memcg;
}

/**
 * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg.
 * @mm: mm from which memcg should be extracted. It can be NULL.
 *
 * Obtain a reference on mm->memcg and returns it if successful. If mm
 * is NULL, then the memcg is chosen as follows:
 * 1) The active memcg, if set.
 * 2) current->mm->memcg, if available
 * 3) root memcg
 * If mem_cgroup is disabled, NULL is returned.
 */
struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
	struct mem_cgroup *memcg;

	if (mem_cgroup_disabled())
		return NULL;

	/*
	 * Page cache insertions can happen without an
	 * actual mm context, e.g.
 during disk probing
	 * on boot, loopback IO, acct() writes etc.
	 *
	 * No need to css_get on root memcg as the reference
	 * counting is disabled on the root level in the
	 * cgroup core. See CSS_NO_REF.
	 */
	if (unlikely(!mm)) {
		memcg = active_memcg();
		if (unlikely(memcg)) {
			/* remote memcg must hold a ref */
			css_get(&memcg->css);
			return memcg;
		}
		mm = current->mm;
		if (unlikely(!mm))
			return root_mem_cgroup;
	}

	rcu_read_lock();
	do {
		/*
		 * Retry until css_tryget() succeeds: mm->owner may change
		 * under us, or the css we looked up may be mid-release.
		 */
		memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
		if (unlikely(!memcg))
			memcg = root_mem_cgroup;
	} while (!css_tryget(&memcg->css));
	rcu_read_unlock();
	return memcg;
}
EXPORT_SYMBOL(get_mem_cgroup_from_mm);

/* Should kmem charging be skipped entirely for the current context? */
static __always_inline bool memcg_kmem_bypass(void)
{
	/* Allow remote memcg charging from any context. */
	if (unlikely(active_memcg()))
		return false;

	/* Memcg to charge can't be determined.
 */
	if (!in_task() || !current->mm || (current->flags & PF_KTHREAD))
		return true;

	return false;
}

/**
 * mem_cgroup_iter - iterate over memory cgroup hierarchy
 * @root: hierarchy root
 * @prev: previously returned memcg, NULL on first invocation
 * @reclaim: cookie for shared reclaim walks, NULL for full walks
 *
 * Returns references to children of the hierarchy below @root, or
 * @root itself, or %NULL after a full round-trip.
 *
 * Caller must pass the return value in @prev on subsequent
 * invocations for reference counting, or use mem_cgroup_iter_break()
 * to cancel a hierarchy walk before the round-trip is complete.
 *
 * Reclaimers can specify a node in @reclaim to divide up the memcgs
 * in the hierarchy among all concurrent reclaimers operating on the
 * same node.
 */
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
				   struct mem_cgroup *prev,
				   struct mem_cgroup_reclaim_cookie *reclaim)
{
	struct mem_cgroup_reclaim_iter *iter;
	struct cgroup_subsys_state *css = NULL;
	struct mem_cgroup *memcg = NULL;
	struct mem_cgroup *pos = NULL;

	if (mem_cgroup_disabled())
		return NULL;

	if (!root)
		root = root_mem_cgroup;

	rcu_read_lock();

	if (reclaim) {
		struct mem_cgroup_per_node *mz;

		/* Shared walks keep their position in the root's per-node iter. */
		mz = root->nodeinfo[reclaim->pgdat->node_id];
		iter = &mz->iter;

		/*
		 * On start, join the current reclaim iteration cycle.
		 * Exit when a concurrent walker completes it.
		 */
		if (!prev)
			reclaim->generation = iter->generation;
		else if (reclaim->generation != iter->generation)
			goto out_unlock;

		while (1) {
			pos = READ_ONCE(iter->position);
			if (!pos || css_tryget(&pos->css))
				break;
			/*
			 * css reference reached zero, so iter->position will
			 * be cleared by ->css_released. However, we should not
			 * rely on this happening soon, because ->css_released
			 * is called from a work queue, and by busy-waiting we
			 * might block it. So we clear iter->position right
			 * away.
			 */
			(void)cmpxchg(&iter->position, pos, NULL);
		}
	} else if (prev) {
		pos = prev;
	}

	if (pos)
		css = &pos->css;

	for (;;) {
		css = css_next_descendant_pre(css, &root->css);
		if (!css) {
			/*
			 * Reclaimers share the hierarchy walk, and a
			 * new one might jump in right at the end of
			 * the hierarchy - make sure they see at least
			 * one group and restart from the beginning.
			 */
			if (!prev)
				continue;
			break;
		}

		/*
		 * Verify the css and acquire a reference. The root
		 * is provided by the caller, so we know it's alive
		 * and kicking, and don't take an extra reference.
		 */
		if (css == &root->css || css_tryget(css)) {
			memcg = mem_cgroup_from_css(css);
			break;
		}
	}

	if (reclaim) {
		/*
		 * The position could have already been updated by a competing
		 * thread, so check that the value hasn't changed since we read
		 * it to avoid reclaiming from the same cgroup twice.
		 */
		(void)cmpxchg(&iter->position, pos, memcg);

		if (pos)
			css_put(&pos->css);

		/* NULL means the walk completed: start a new generation. */
		if (!memcg)
			iter->generation++;
	}

out_unlock:
	rcu_read_unlock();
	if (prev && prev != root)
		css_put(&prev->css);

	return memcg;
}

/**
 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
 * @root: hierarchy root
 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
 */
void mem_cgroup_iter_break(struct mem_cgroup *root,
			   struct mem_cgroup *prev)
{
	if (!root)
		root = root_mem_cgroup;
	if (prev && prev != root)
		css_put(&prev->css);
123962306a36Sopenharmony_ci} 124062306a36Sopenharmony_ci 124162306a36Sopenharmony_cistatic void __invalidate_reclaim_iterators(struct mem_cgroup *from, 124262306a36Sopenharmony_ci struct mem_cgroup *dead_memcg) 124362306a36Sopenharmony_ci{ 124462306a36Sopenharmony_ci struct mem_cgroup_reclaim_iter *iter; 124562306a36Sopenharmony_ci struct mem_cgroup_per_node *mz; 124662306a36Sopenharmony_ci int nid; 124762306a36Sopenharmony_ci 124862306a36Sopenharmony_ci for_each_node(nid) { 124962306a36Sopenharmony_ci mz = from->nodeinfo[nid]; 125062306a36Sopenharmony_ci iter = &mz->iter; 125162306a36Sopenharmony_ci cmpxchg(&iter->position, dead_memcg, NULL); 125262306a36Sopenharmony_ci } 125362306a36Sopenharmony_ci} 125462306a36Sopenharmony_ci 125562306a36Sopenharmony_cistatic void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) 125662306a36Sopenharmony_ci{ 125762306a36Sopenharmony_ci struct mem_cgroup *memcg = dead_memcg; 125862306a36Sopenharmony_ci struct mem_cgroup *last; 125962306a36Sopenharmony_ci 126062306a36Sopenharmony_ci do { 126162306a36Sopenharmony_ci __invalidate_reclaim_iterators(memcg, dead_memcg); 126262306a36Sopenharmony_ci last = memcg; 126362306a36Sopenharmony_ci } while ((memcg = parent_mem_cgroup(memcg))); 126462306a36Sopenharmony_ci 126562306a36Sopenharmony_ci /* 126662306a36Sopenharmony_ci * When cgroup1 non-hierarchy mode is used, 126762306a36Sopenharmony_ci * parent_mem_cgroup() does not walk all the way up to the 126862306a36Sopenharmony_ci * cgroup root (root_mem_cgroup). So we have to handle 126962306a36Sopenharmony_ci * dead_memcg from cgroup root separately. 
127062306a36Sopenharmony_ci */ 127162306a36Sopenharmony_ci if (!mem_cgroup_is_root(last)) 127262306a36Sopenharmony_ci __invalidate_reclaim_iterators(root_mem_cgroup, 127362306a36Sopenharmony_ci dead_memcg); 127462306a36Sopenharmony_ci} 127562306a36Sopenharmony_ci 127662306a36Sopenharmony_ci/** 127762306a36Sopenharmony_ci * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy 127862306a36Sopenharmony_ci * @memcg: hierarchy root 127962306a36Sopenharmony_ci * @fn: function to call for each task 128062306a36Sopenharmony_ci * @arg: argument passed to @fn 128162306a36Sopenharmony_ci * 128262306a36Sopenharmony_ci * This function iterates over tasks attached to @memcg or to any of its 128362306a36Sopenharmony_ci * descendants and calls @fn for each task. If @fn returns a non-zero 128462306a36Sopenharmony_ci * value, the function breaks the iteration loop. Otherwise, it will iterate 128562306a36Sopenharmony_ci * over all tasks and return 0. 128662306a36Sopenharmony_ci * 128762306a36Sopenharmony_ci * This function must not be called for the root memory cgroup. 
128862306a36Sopenharmony_ci */ 128962306a36Sopenharmony_civoid mem_cgroup_scan_tasks(struct mem_cgroup *memcg, 129062306a36Sopenharmony_ci int (*fn)(struct task_struct *, void *), void *arg) 129162306a36Sopenharmony_ci{ 129262306a36Sopenharmony_ci struct mem_cgroup *iter; 129362306a36Sopenharmony_ci int ret = 0; 129462306a36Sopenharmony_ci 129562306a36Sopenharmony_ci BUG_ON(mem_cgroup_is_root(memcg)); 129662306a36Sopenharmony_ci 129762306a36Sopenharmony_ci for_each_mem_cgroup_tree(iter, memcg) { 129862306a36Sopenharmony_ci struct css_task_iter it; 129962306a36Sopenharmony_ci struct task_struct *task; 130062306a36Sopenharmony_ci 130162306a36Sopenharmony_ci css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it); 130262306a36Sopenharmony_ci while (!ret && (task = css_task_iter_next(&it))) 130362306a36Sopenharmony_ci ret = fn(task, arg); 130462306a36Sopenharmony_ci css_task_iter_end(&it); 130562306a36Sopenharmony_ci if (ret) { 130662306a36Sopenharmony_ci mem_cgroup_iter_break(memcg, iter); 130762306a36Sopenharmony_ci break; 130862306a36Sopenharmony_ci } 130962306a36Sopenharmony_ci } 131062306a36Sopenharmony_ci} 131162306a36Sopenharmony_ci 131262306a36Sopenharmony_ci#ifdef CONFIG_DEBUG_VM 131362306a36Sopenharmony_civoid lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) 131462306a36Sopenharmony_ci{ 131562306a36Sopenharmony_ci struct mem_cgroup *memcg; 131662306a36Sopenharmony_ci 131762306a36Sopenharmony_ci if (mem_cgroup_disabled()) 131862306a36Sopenharmony_ci return; 131962306a36Sopenharmony_ci 132062306a36Sopenharmony_ci memcg = folio_memcg(folio); 132162306a36Sopenharmony_ci 132262306a36Sopenharmony_ci if (!memcg) 132362306a36Sopenharmony_ci VM_BUG_ON_FOLIO(!mem_cgroup_is_root(lruvec_memcg(lruvec)), folio); 132462306a36Sopenharmony_ci else 132562306a36Sopenharmony_ci VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != memcg, folio); 132662306a36Sopenharmony_ci} 132762306a36Sopenharmony_ci#endif 132862306a36Sopenharmony_ci 132962306a36Sopenharmony_ci/** 
133062306a36Sopenharmony_ci * folio_lruvec_lock - Lock the lruvec for a folio. 133162306a36Sopenharmony_ci * @folio: Pointer to the folio. 133262306a36Sopenharmony_ci * 133362306a36Sopenharmony_ci * These functions are safe to use under any of the following conditions: 133462306a36Sopenharmony_ci * - folio locked 133562306a36Sopenharmony_ci * - folio_test_lru false 133662306a36Sopenharmony_ci * - folio_memcg_lock() 133762306a36Sopenharmony_ci * - folio frozen (refcount of 0) 133862306a36Sopenharmony_ci * 133962306a36Sopenharmony_ci * Return: The lruvec this folio is on with its lock held. 134062306a36Sopenharmony_ci */ 134162306a36Sopenharmony_cistruct lruvec *folio_lruvec_lock(struct folio *folio) 134262306a36Sopenharmony_ci{ 134362306a36Sopenharmony_ci struct lruvec *lruvec = folio_lruvec(folio); 134462306a36Sopenharmony_ci 134562306a36Sopenharmony_ci spin_lock(&lruvec->lru_lock); 134662306a36Sopenharmony_ci lruvec_memcg_debug(lruvec, folio); 134762306a36Sopenharmony_ci 134862306a36Sopenharmony_ci return lruvec; 134962306a36Sopenharmony_ci} 135062306a36Sopenharmony_ci 135162306a36Sopenharmony_ci/** 135262306a36Sopenharmony_ci * folio_lruvec_lock_irq - Lock the lruvec for a folio. 135362306a36Sopenharmony_ci * @folio: Pointer to the folio. 135462306a36Sopenharmony_ci * 135562306a36Sopenharmony_ci * These functions are safe to use under any of the following conditions: 135662306a36Sopenharmony_ci * - folio locked 135762306a36Sopenharmony_ci * - folio_test_lru false 135862306a36Sopenharmony_ci * - folio_memcg_lock() 135962306a36Sopenharmony_ci * - folio frozen (refcount of 0) 136062306a36Sopenharmony_ci * 136162306a36Sopenharmony_ci * Return: The lruvec this folio is on with its lock held and interrupts 136262306a36Sopenharmony_ci * disabled. 
136362306a36Sopenharmony_ci */ 136462306a36Sopenharmony_cistruct lruvec *folio_lruvec_lock_irq(struct folio *folio) 136562306a36Sopenharmony_ci{ 136662306a36Sopenharmony_ci struct lruvec *lruvec = folio_lruvec(folio); 136762306a36Sopenharmony_ci 136862306a36Sopenharmony_ci spin_lock_irq(&lruvec->lru_lock); 136962306a36Sopenharmony_ci lruvec_memcg_debug(lruvec, folio); 137062306a36Sopenharmony_ci 137162306a36Sopenharmony_ci return lruvec; 137262306a36Sopenharmony_ci} 137362306a36Sopenharmony_ci 137462306a36Sopenharmony_ci/** 137562306a36Sopenharmony_ci * folio_lruvec_lock_irqsave - Lock the lruvec for a folio. 137662306a36Sopenharmony_ci * @folio: Pointer to the folio. 137762306a36Sopenharmony_ci * @flags: Pointer to irqsave flags. 137862306a36Sopenharmony_ci * 137962306a36Sopenharmony_ci * These functions are safe to use under any of the following conditions: 138062306a36Sopenharmony_ci * - folio locked 138162306a36Sopenharmony_ci * - folio_test_lru false 138262306a36Sopenharmony_ci * - folio_memcg_lock() 138362306a36Sopenharmony_ci * - folio frozen (refcount of 0) 138462306a36Sopenharmony_ci * 138562306a36Sopenharmony_ci * Return: The lruvec this folio is on with its lock held and interrupts 138662306a36Sopenharmony_ci * disabled. 
138762306a36Sopenharmony_ci */ 138862306a36Sopenharmony_cistruct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, 138962306a36Sopenharmony_ci unsigned long *flags) 139062306a36Sopenharmony_ci{ 139162306a36Sopenharmony_ci struct lruvec *lruvec = folio_lruvec(folio); 139262306a36Sopenharmony_ci 139362306a36Sopenharmony_ci spin_lock_irqsave(&lruvec->lru_lock, *flags); 139462306a36Sopenharmony_ci lruvec_memcg_debug(lruvec, folio); 139562306a36Sopenharmony_ci 139662306a36Sopenharmony_ci return lruvec; 139762306a36Sopenharmony_ci} 139862306a36Sopenharmony_ci 139962306a36Sopenharmony_ci/** 140062306a36Sopenharmony_ci * mem_cgroup_update_lru_size - account for adding or removing an lru page 140162306a36Sopenharmony_ci * @lruvec: mem_cgroup per zone lru vector 140262306a36Sopenharmony_ci * @lru: index of lru list the page is sitting on 140362306a36Sopenharmony_ci * @zid: zone id of the accounted pages 140462306a36Sopenharmony_ci * @nr_pages: positive when adding or negative when removing 140562306a36Sopenharmony_ci * 140662306a36Sopenharmony_ci * This function must be called under lru_lock, just before a page is added 140762306a36Sopenharmony_ci * to or just after a page is removed from an lru list. 
140862306a36Sopenharmony_ci */ 140962306a36Sopenharmony_civoid mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, 141062306a36Sopenharmony_ci int zid, int nr_pages) 141162306a36Sopenharmony_ci{ 141262306a36Sopenharmony_ci struct mem_cgroup_per_node *mz; 141362306a36Sopenharmony_ci unsigned long *lru_size; 141462306a36Sopenharmony_ci long size; 141562306a36Sopenharmony_ci 141662306a36Sopenharmony_ci if (mem_cgroup_disabled()) 141762306a36Sopenharmony_ci return; 141862306a36Sopenharmony_ci 141962306a36Sopenharmony_ci#ifdef CONFIG_HYPERHOLD_FILE_LRU 142062306a36Sopenharmony_ci if (is_node_lruvec(lruvec)) 142162306a36Sopenharmony_ci return; 142262306a36Sopenharmony_ci#endif 142362306a36Sopenharmony_ci 142462306a36Sopenharmony_ci mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); 142562306a36Sopenharmony_ci lru_size = &mz->lru_zone_size[zid][lru]; 142662306a36Sopenharmony_ci 142762306a36Sopenharmony_ci if (nr_pages < 0) 142862306a36Sopenharmony_ci *lru_size += nr_pages; 142962306a36Sopenharmony_ci 143062306a36Sopenharmony_ci size = *lru_size; 143162306a36Sopenharmony_ci if (WARN_ONCE(size < 0, 143262306a36Sopenharmony_ci "%s(%p, %d, %d): lru_size %ld\n", 143362306a36Sopenharmony_ci __func__, lruvec, lru, nr_pages, size)) { 143462306a36Sopenharmony_ci VM_BUG_ON(1); 143562306a36Sopenharmony_ci *lru_size = 0; 143662306a36Sopenharmony_ci } 143762306a36Sopenharmony_ci 143862306a36Sopenharmony_ci if (nr_pages > 0) 143962306a36Sopenharmony_ci *lru_size += nr_pages; 144062306a36Sopenharmony_ci} 144162306a36Sopenharmony_ci 144262306a36Sopenharmony_ci/** 144362306a36Sopenharmony_ci * mem_cgroup_margin - calculate chargeable space of a memory cgroup 144462306a36Sopenharmony_ci * @memcg: the memory cgroup 144562306a36Sopenharmony_ci * 144662306a36Sopenharmony_ci * Returns the maximum amount of memory @mem can be charged with, in 144762306a36Sopenharmony_ci * pages. 
144862306a36Sopenharmony_ci */ 144962306a36Sopenharmony_cistatic unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) 145062306a36Sopenharmony_ci{ 145162306a36Sopenharmony_ci unsigned long margin = 0; 145262306a36Sopenharmony_ci unsigned long count; 145362306a36Sopenharmony_ci unsigned long limit; 145462306a36Sopenharmony_ci 145562306a36Sopenharmony_ci count = page_counter_read(&memcg->memory); 145662306a36Sopenharmony_ci limit = READ_ONCE(memcg->memory.max); 145762306a36Sopenharmony_ci if (count < limit) 145862306a36Sopenharmony_ci margin = limit - count; 145962306a36Sopenharmony_ci 146062306a36Sopenharmony_ci if (do_memsw_account()) { 146162306a36Sopenharmony_ci count = page_counter_read(&memcg->memsw); 146262306a36Sopenharmony_ci limit = READ_ONCE(memcg->memsw.max); 146362306a36Sopenharmony_ci if (count < limit) 146462306a36Sopenharmony_ci margin = min(margin, limit - count); 146562306a36Sopenharmony_ci else 146662306a36Sopenharmony_ci margin = 0; 146762306a36Sopenharmony_ci } 146862306a36Sopenharmony_ci 146962306a36Sopenharmony_ci return margin; 147062306a36Sopenharmony_ci} 147162306a36Sopenharmony_ci 147262306a36Sopenharmony_ci/* 147362306a36Sopenharmony_ci * A routine for checking "mem" is under move_account() or not. 147462306a36Sopenharmony_ci * 147562306a36Sopenharmony_ci * Checking a cgroup is mc.from or mc.to or under hierarchy of 147662306a36Sopenharmony_ci * moving cgroups. This is for waiting at high-memory pressure 147762306a36Sopenharmony_ci * caused by "move". 147862306a36Sopenharmony_ci */ 147962306a36Sopenharmony_cistatic bool mem_cgroup_under_move(struct mem_cgroup *memcg) 148062306a36Sopenharmony_ci{ 148162306a36Sopenharmony_ci struct mem_cgroup *from; 148262306a36Sopenharmony_ci struct mem_cgroup *to; 148362306a36Sopenharmony_ci bool ret = false; 148462306a36Sopenharmony_ci /* 148562306a36Sopenharmony_ci * Unlike task_move routines, we access mc.to, mc.from not under 148662306a36Sopenharmony_ci * mutual exclusion by cgroup_mutex. 
Here, we take spinlock instead. 148762306a36Sopenharmony_ci */ 148862306a36Sopenharmony_ci spin_lock(&mc.lock); 148962306a36Sopenharmony_ci from = mc.from; 149062306a36Sopenharmony_ci to = mc.to; 149162306a36Sopenharmony_ci if (!from) 149262306a36Sopenharmony_ci goto unlock; 149362306a36Sopenharmony_ci 149462306a36Sopenharmony_ci ret = mem_cgroup_is_descendant(from, memcg) || 149562306a36Sopenharmony_ci mem_cgroup_is_descendant(to, memcg); 149662306a36Sopenharmony_ciunlock: 149762306a36Sopenharmony_ci spin_unlock(&mc.lock); 149862306a36Sopenharmony_ci return ret; 149962306a36Sopenharmony_ci} 150062306a36Sopenharmony_ci 150162306a36Sopenharmony_cistatic bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) 150262306a36Sopenharmony_ci{ 150362306a36Sopenharmony_ci if (mc.moving_task && current != mc.moving_task) { 150462306a36Sopenharmony_ci if (mem_cgroup_under_move(memcg)) { 150562306a36Sopenharmony_ci DEFINE_WAIT(wait); 150662306a36Sopenharmony_ci prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); 150762306a36Sopenharmony_ci /* moving charge context might have finished. 
*/ 150862306a36Sopenharmony_ci if (mc.moving_task) 150962306a36Sopenharmony_ci schedule(); 151062306a36Sopenharmony_ci finish_wait(&mc.waitq, &wait); 151162306a36Sopenharmony_ci return true; 151262306a36Sopenharmony_ci } 151362306a36Sopenharmony_ci } 151462306a36Sopenharmony_ci return false; 151562306a36Sopenharmony_ci} 151662306a36Sopenharmony_ci 151762306a36Sopenharmony_cistruct memory_stat { 151862306a36Sopenharmony_ci const char *name; 151962306a36Sopenharmony_ci unsigned int idx; 152062306a36Sopenharmony_ci}; 152162306a36Sopenharmony_ci 152262306a36Sopenharmony_cistatic const struct memory_stat memory_stats[] = { 152362306a36Sopenharmony_ci { "anon", NR_ANON_MAPPED }, 152462306a36Sopenharmony_ci { "file", NR_FILE_PAGES }, 152562306a36Sopenharmony_ci { "kernel", MEMCG_KMEM }, 152662306a36Sopenharmony_ci { "kernel_stack", NR_KERNEL_STACK_KB }, 152762306a36Sopenharmony_ci { "pagetables", NR_PAGETABLE }, 152862306a36Sopenharmony_ci { "sec_pagetables", NR_SECONDARY_PAGETABLE }, 152962306a36Sopenharmony_ci { "percpu", MEMCG_PERCPU_B }, 153062306a36Sopenharmony_ci { "sock", MEMCG_SOCK }, 153162306a36Sopenharmony_ci { "vmalloc", MEMCG_VMALLOC }, 153262306a36Sopenharmony_ci { "shmem", NR_SHMEM }, 153362306a36Sopenharmony_ci#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) 153462306a36Sopenharmony_ci { "zswap", MEMCG_ZSWAP_B }, 153562306a36Sopenharmony_ci { "zswapped", MEMCG_ZSWAPPED }, 153662306a36Sopenharmony_ci#endif 153762306a36Sopenharmony_ci { "file_mapped", NR_FILE_MAPPED }, 153862306a36Sopenharmony_ci { "file_dirty", NR_FILE_DIRTY }, 153962306a36Sopenharmony_ci { "file_writeback", NR_WRITEBACK }, 154062306a36Sopenharmony_ci#ifdef CONFIG_SWAP 154162306a36Sopenharmony_ci { "swapcached", NR_SWAPCACHE }, 154262306a36Sopenharmony_ci#endif 154362306a36Sopenharmony_ci#ifdef CONFIG_TRANSPARENT_HUGEPAGE 154462306a36Sopenharmony_ci { "anon_thp", NR_ANON_THPS }, 154562306a36Sopenharmony_ci { "file_thp", NR_FILE_THPS }, 154662306a36Sopenharmony_ci { "shmem_thp", 
NR_SHMEM_THPS }, 154762306a36Sopenharmony_ci#endif 154862306a36Sopenharmony_ci { "inactive_anon", NR_INACTIVE_ANON }, 154962306a36Sopenharmony_ci { "active_anon", NR_ACTIVE_ANON }, 155062306a36Sopenharmony_ci { "inactive_file", NR_INACTIVE_FILE }, 155162306a36Sopenharmony_ci { "active_file", NR_ACTIVE_FILE }, 155262306a36Sopenharmony_ci { "unevictable", NR_UNEVICTABLE }, 155362306a36Sopenharmony_ci { "slab_reclaimable", NR_SLAB_RECLAIMABLE_B }, 155462306a36Sopenharmony_ci { "slab_unreclaimable", NR_SLAB_UNRECLAIMABLE_B }, 155562306a36Sopenharmony_ci 155662306a36Sopenharmony_ci /* The memory events */ 155762306a36Sopenharmony_ci { "workingset_refault_anon", WORKINGSET_REFAULT_ANON }, 155862306a36Sopenharmony_ci { "workingset_refault_file", WORKINGSET_REFAULT_FILE }, 155962306a36Sopenharmony_ci { "workingset_activate_anon", WORKINGSET_ACTIVATE_ANON }, 156062306a36Sopenharmony_ci { "workingset_activate_file", WORKINGSET_ACTIVATE_FILE }, 156162306a36Sopenharmony_ci { "workingset_restore_anon", WORKINGSET_RESTORE_ANON }, 156262306a36Sopenharmony_ci { "workingset_restore_file", WORKINGSET_RESTORE_FILE }, 156362306a36Sopenharmony_ci { "workingset_nodereclaim", WORKINGSET_NODERECLAIM }, 156462306a36Sopenharmony_ci}; 156562306a36Sopenharmony_ci 156662306a36Sopenharmony_ci/* Translate stat items to the correct unit for memory.stat output */ 156762306a36Sopenharmony_cistatic int memcg_page_state_unit(int item) 156862306a36Sopenharmony_ci{ 156962306a36Sopenharmony_ci switch (item) { 157062306a36Sopenharmony_ci case MEMCG_PERCPU_B: 157162306a36Sopenharmony_ci case MEMCG_ZSWAP_B: 157262306a36Sopenharmony_ci case NR_SLAB_RECLAIMABLE_B: 157362306a36Sopenharmony_ci case NR_SLAB_UNRECLAIMABLE_B: 157462306a36Sopenharmony_ci case WORKINGSET_REFAULT_ANON: 157562306a36Sopenharmony_ci case WORKINGSET_REFAULT_FILE: 157662306a36Sopenharmony_ci case WORKINGSET_ACTIVATE_ANON: 157762306a36Sopenharmony_ci case WORKINGSET_ACTIVATE_FILE: 157862306a36Sopenharmony_ci case WORKINGSET_RESTORE_ANON: 
157962306a36Sopenharmony_ci case WORKINGSET_RESTORE_FILE: 158062306a36Sopenharmony_ci case WORKINGSET_NODERECLAIM: 158162306a36Sopenharmony_ci return 1; 158262306a36Sopenharmony_ci case NR_KERNEL_STACK_KB: 158362306a36Sopenharmony_ci return SZ_1K; 158462306a36Sopenharmony_ci default: 158562306a36Sopenharmony_ci return PAGE_SIZE; 158662306a36Sopenharmony_ci } 158762306a36Sopenharmony_ci} 158862306a36Sopenharmony_ci 158962306a36Sopenharmony_cistatic inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg, 159062306a36Sopenharmony_ci int item) 159162306a36Sopenharmony_ci{ 159262306a36Sopenharmony_ci return memcg_page_state(memcg, item) * memcg_page_state_unit(item); 159362306a36Sopenharmony_ci} 159462306a36Sopenharmony_ci 159562306a36Sopenharmony_cistatic void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) 159662306a36Sopenharmony_ci{ 159762306a36Sopenharmony_ci int i; 159862306a36Sopenharmony_ci 159962306a36Sopenharmony_ci /* 160062306a36Sopenharmony_ci * Provide statistics on the state of the memory subsystem as 160162306a36Sopenharmony_ci * well as cumulative event counters that show past behavior. 
160262306a36Sopenharmony_ci * 160362306a36Sopenharmony_ci * This list is ordered following a combination of these gradients: 160462306a36Sopenharmony_ci * 1) generic big picture -> specifics and details 160562306a36Sopenharmony_ci * 2) reflecting userspace activity -> reflecting kernel heuristics 160662306a36Sopenharmony_ci * 160762306a36Sopenharmony_ci * Current memory state: 160862306a36Sopenharmony_ci */ 160962306a36Sopenharmony_ci mem_cgroup_flush_stats(); 161062306a36Sopenharmony_ci 161162306a36Sopenharmony_ci for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { 161262306a36Sopenharmony_ci u64 size; 161362306a36Sopenharmony_ci 161462306a36Sopenharmony_ci size = memcg_page_state_output(memcg, memory_stats[i].idx); 161562306a36Sopenharmony_ci seq_buf_printf(s, "%s %llu\n", memory_stats[i].name, size); 161662306a36Sopenharmony_ci 161762306a36Sopenharmony_ci if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) { 161862306a36Sopenharmony_ci size += memcg_page_state_output(memcg, 161962306a36Sopenharmony_ci NR_SLAB_RECLAIMABLE_B); 162062306a36Sopenharmony_ci seq_buf_printf(s, "slab %llu\n", size); 162162306a36Sopenharmony_ci } 162262306a36Sopenharmony_ci } 162362306a36Sopenharmony_ci 162462306a36Sopenharmony_ci /* Accumulated memory events */ 162562306a36Sopenharmony_ci seq_buf_printf(s, "pgscan %lu\n", 162662306a36Sopenharmony_ci memcg_events(memcg, PGSCAN_KSWAPD) + 162762306a36Sopenharmony_ci memcg_events(memcg, PGSCAN_DIRECT) + 162862306a36Sopenharmony_ci memcg_events(memcg, PGSCAN_KHUGEPAGED)); 162962306a36Sopenharmony_ci seq_buf_printf(s, "pgsteal %lu\n", 163062306a36Sopenharmony_ci memcg_events(memcg, PGSTEAL_KSWAPD) + 163162306a36Sopenharmony_ci memcg_events(memcg, PGSTEAL_DIRECT) + 163262306a36Sopenharmony_ci memcg_events(memcg, PGSTEAL_KHUGEPAGED)); 163362306a36Sopenharmony_ci 163462306a36Sopenharmony_ci for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) { 163562306a36Sopenharmony_ci if (memcg_vm_event_stat[i] == PGPGIN || 
163662306a36Sopenharmony_ci memcg_vm_event_stat[i] == PGPGOUT) 163762306a36Sopenharmony_ci continue; 163862306a36Sopenharmony_ci 163962306a36Sopenharmony_ci seq_buf_printf(s, "%s %lu\n", 164062306a36Sopenharmony_ci vm_event_name(memcg_vm_event_stat[i]), 164162306a36Sopenharmony_ci memcg_events(memcg, memcg_vm_event_stat[i])); 164262306a36Sopenharmony_ci } 164362306a36Sopenharmony_ci 164462306a36Sopenharmony_ci /* The above should easily fit into one page */ 164562306a36Sopenharmony_ci WARN_ON_ONCE(seq_buf_has_overflowed(s)); 164662306a36Sopenharmony_ci} 164762306a36Sopenharmony_ci 164862306a36Sopenharmony_cistatic void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s); 164962306a36Sopenharmony_ci 165062306a36Sopenharmony_cistatic void memory_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) 165162306a36Sopenharmony_ci{ 165262306a36Sopenharmony_ci if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 165362306a36Sopenharmony_ci memcg_stat_format(memcg, s); 165462306a36Sopenharmony_ci else 165562306a36Sopenharmony_ci memcg1_stat_format(memcg, s); 165662306a36Sopenharmony_ci WARN_ON_ONCE(seq_buf_has_overflowed(s)); 165762306a36Sopenharmony_ci} 165862306a36Sopenharmony_ci 165962306a36Sopenharmony_ci/** 166062306a36Sopenharmony_ci * mem_cgroup_print_oom_context: Print OOM information relevant to 166162306a36Sopenharmony_ci * memory controller. 
166262306a36Sopenharmony_ci * @memcg: The memory cgroup that went over limit 166362306a36Sopenharmony_ci * @p: Task that is going to be killed 166462306a36Sopenharmony_ci * 166562306a36Sopenharmony_ci * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is 166662306a36Sopenharmony_ci * enabled 166762306a36Sopenharmony_ci */ 166862306a36Sopenharmony_civoid mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p) 166962306a36Sopenharmony_ci{ 167062306a36Sopenharmony_ci rcu_read_lock(); 167162306a36Sopenharmony_ci 167262306a36Sopenharmony_ci if (memcg) { 167362306a36Sopenharmony_ci pr_cont(",oom_memcg="); 167462306a36Sopenharmony_ci pr_cont_cgroup_path(memcg->css.cgroup); 167562306a36Sopenharmony_ci } else 167662306a36Sopenharmony_ci pr_cont(",global_oom"); 167762306a36Sopenharmony_ci if (p) { 167862306a36Sopenharmony_ci pr_cont(",task_memcg="); 167962306a36Sopenharmony_ci pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); 168062306a36Sopenharmony_ci } 168162306a36Sopenharmony_ci rcu_read_unlock(); 168262306a36Sopenharmony_ci} 168362306a36Sopenharmony_ci 168462306a36Sopenharmony_ci/** 168562306a36Sopenharmony_ci * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to 168662306a36Sopenharmony_ci * memory controller. 168762306a36Sopenharmony_ci * @memcg: The memory cgroup that went over limit 168862306a36Sopenharmony_ci */ 168962306a36Sopenharmony_civoid mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) 169062306a36Sopenharmony_ci{ 169162306a36Sopenharmony_ci /* Use static buffer, for the caller is holding oom_lock. 
*/ 169262306a36Sopenharmony_ci static char buf[PAGE_SIZE]; 169362306a36Sopenharmony_ci struct seq_buf s; 169462306a36Sopenharmony_ci 169562306a36Sopenharmony_ci lockdep_assert_held(&oom_lock); 169662306a36Sopenharmony_ci 169762306a36Sopenharmony_ci pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n", 169862306a36Sopenharmony_ci K((u64)page_counter_read(&memcg->memory)), 169962306a36Sopenharmony_ci K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt); 170062306a36Sopenharmony_ci if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 170162306a36Sopenharmony_ci pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n", 170262306a36Sopenharmony_ci K((u64)page_counter_read(&memcg->swap)), 170362306a36Sopenharmony_ci K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt); 170462306a36Sopenharmony_ci else { 170562306a36Sopenharmony_ci pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n", 170662306a36Sopenharmony_ci K((u64)page_counter_read(&memcg->memsw)), 170762306a36Sopenharmony_ci K((u64)memcg->memsw.max), memcg->memsw.failcnt); 170862306a36Sopenharmony_ci pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n", 170962306a36Sopenharmony_ci K((u64)page_counter_read(&memcg->kmem)), 171062306a36Sopenharmony_ci K((u64)memcg->kmem.max), memcg->kmem.failcnt); 171162306a36Sopenharmony_ci } 171262306a36Sopenharmony_ci 171362306a36Sopenharmony_ci pr_info("Memory cgroup stats for "); 171462306a36Sopenharmony_ci pr_cont_cgroup_path(memcg->css.cgroup); 171562306a36Sopenharmony_ci pr_cont(":"); 171662306a36Sopenharmony_ci seq_buf_init(&s, buf, sizeof(buf)); 171762306a36Sopenharmony_ci memory_stat_format(memcg, &s); 171862306a36Sopenharmony_ci seq_buf_do_printk(&s, KERN_INFO); 171962306a36Sopenharmony_ci} 172062306a36Sopenharmony_ci 172162306a36Sopenharmony_ci/* 172262306a36Sopenharmony_ci * Return the memory (and swap, if configured) limit for a memcg. 
172362306a36Sopenharmony_ci */ 172462306a36Sopenharmony_ciunsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) 172562306a36Sopenharmony_ci{ 172662306a36Sopenharmony_ci unsigned long max = READ_ONCE(memcg->memory.max); 172762306a36Sopenharmony_ci 172862306a36Sopenharmony_ci if (do_memsw_account()) { 172962306a36Sopenharmony_ci if (mem_cgroup_swappiness(memcg)) { 173062306a36Sopenharmony_ci /* Calculate swap excess capacity from memsw limit */ 173162306a36Sopenharmony_ci unsigned long swap = READ_ONCE(memcg->memsw.max) - max; 173262306a36Sopenharmony_ci 173362306a36Sopenharmony_ci max += min(swap, (unsigned long)total_swap_pages); 173462306a36Sopenharmony_ci } 173562306a36Sopenharmony_ci } else { 173662306a36Sopenharmony_ci if (mem_cgroup_swappiness(memcg)) 173762306a36Sopenharmony_ci max += min(READ_ONCE(memcg->swap.max), 173862306a36Sopenharmony_ci (unsigned long)total_swap_pages); 173962306a36Sopenharmony_ci } 174062306a36Sopenharmony_ci return max; 174162306a36Sopenharmony_ci} 174262306a36Sopenharmony_ci 174362306a36Sopenharmony_ciunsigned long mem_cgroup_size(struct mem_cgroup *memcg) 174462306a36Sopenharmony_ci{ 174562306a36Sopenharmony_ci return page_counter_read(&memcg->memory); 174662306a36Sopenharmony_ci} 174762306a36Sopenharmony_ci 174862306a36Sopenharmony_cistatic bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, 174962306a36Sopenharmony_ci int order) 175062306a36Sopenharmony_ci{ 175162306a36Sopenharmony_ci struct oom_control oc = { 175262306a36Sopenharmony_ci .zonelist = NULL, 175362306a36Sopenharmony_ci .nodemask = NULL, 175462306a36Sopenharmony_ci .memcg = memcg, 175562306a36Sopenharmony_ci .gfp_mask = gfp_mask, 175662306a36Sopenharmony_ci .order = order, 175762306a36Sopenharmony_ci }; 175862306a36Sopenharmony_ci bool ret = true; 175962306a36Sopenharmony_ci 176062306a36Sopenharmony_ci if (mutex_lock_killable(&oom_lock)) 176162306a36Sopenharmony_ci return true; 176262306a36Sopenharmony_ci 176362306a36Sopenharmony_ci if 
(mem_cgroup_margin(memcg) >= (1 << order)) 176462306a36Sopenharmony_ci goto unlock; 176562306a36Sopenharmony_ci 176662306a36Sopenharmony_ci /* 176762306a36Sopenharmony_ci * A few threads which were not waiting at mutex_lock_killable() can 176862306a36Sopenharmony_ci * fail to bail out. Therefore, check again after holding oom_lock. 176962306a36Sopenharmony_ci */ 177062306a36Sopenharmony_ci ret = task_is_dying() || out_of_memory(&oc); 177162306a36Sopenharmony_ci 177262306a36Sopenharmony_ciunlock: 177362306a36Sopenharmony_ci mutex_unlock(&oom_lock); 177462306a36Sopenharmony_ci return ret; 177562306a36Sopenharmony_ci} 177662306a36Sopenharmony_ci 177762306a36Sopenharmony_cistatic int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, 177862306a36Sopenharmony_ci pg_data_t *pgdat, 177962306a36Sopenharmony_ci gfp_t gfp_mask, 178062306a36Sopenharmony_ci unsigned long *total_scanned) 178162306a36Sopenharmony_ci{ 178262306a36Sopenharmony_ci struct mem_cgroup *victim = NULL; 178362306a36Sopenharmony_ci int total = 0; 178462306a36Sopenharmony_ci int loop = 0; 178562306a36Sopenharmony_ci unsigned long excess; 178662306a36Sopenharmony_ci unsigned long nr_scanned; 178762306a36Sopenharmony_ci struct mem_cgroup_reclaim_cookie reclaim = { 178862306a36Sopenharmony_ci .pgdat = pgdat, 178962306a36Sopenharmony_ci }; 179062306a36Sopenharmony_ci 179162306a36Sopenharmony_ci excess = soft_limit_excess(root_memcg); 179262306a36Sopenharmony_ci 179362306a36Sopenharmony_ci while (1) { 179462306a36Sopenharmony_ci victim = mem_cgroup_iter(root_memcg, victim, &reclaim); 179562306a36Sopenharmony_ci if (!victim) { 179662306a36Sopenharmony_ci loop++; 179762306a36Sopenharmony_ci if (loop >= 2) { 179862306a36Sopenharmony_ci /* 179962306a36Sopenharmony_ci * If we have not been able to reclaim 180062306a36Sopenharmony_ci * anything, it might because there are 180162306a36Sopenharmony_ci * no reclaimable pages under this hierarchy 180262306a36Sopenharmony_ci */ 180362306a36Sopenharmony_ci if (!total) 
180462306a36Sopenharmony_ci break; 180562306a36Sopenharmony_ci /* 180662306a36Sopenharmony_ci * We want to do more targeted reclaim. 180762306a36Sopenharmony_ci * excess >> 2 is not to excessive so as to 180862306a36Sopenharmony_ci * reclaim too much, nor too less that we keep 180962306a36Sopenharmony_ci * coming back to reclaim from this cgroup 181062306a36Sopenharmony_ci */ 181162306a36Sopenharmony_ci if (total >= (excess >> 2) || 181262306a36Sopenharmony_ci (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) 181362306a36Sopenharmony_ci break; 181462306a36Sopenharmony_ci } 181562306a36Sopenharmony_ci continue; 181662306a36Sopenharmony_ci } 181762306a36Sopenharmony_ci total += mem_cgroup_shrink_node(victim, gfp_mask, false, 181862306a36Sopenharmony_ci pgdat, &nr_scanned); 181962306a36Sopenharmony_ci *total_scanned += nr_scanned; 182062306a36Sopenharmony_ci if (!soft_limit_excess(root_memcg)) 182162306a36Sopenharmony_ci break; 182262306a36Sopenharmony_ci } 182362306a36Sopenharmony_ci mem_cgroup_iter_break(root_memcg, victim); 182462306a36Sopenharmony_ci return total; 182562306a36Sopenharmony_ci} 182662306a36Sopenharmony_ci 182762306a36Sopenharmony_ci#ifdef CONFIG_LOCKDEP 182862306a36Sopenharmony_cistatic struct lockdep_map memcg_oom_lock_dep_map = { 182962306a36Sopenharmony_ci .name = "memcg_oom_lock", 183062306a36Sopenharmony_ci}; 183162306a36Sopenharmony_ci#endif 183262306a36Sopenharmony_ci 183362306a36Sopenharmony_cistatic DEFINE_SPINLOCK(memcg_oom_lock); 183462306a36Sopenharmony_ci 183562306a36Sopenharmony_ci/* 183662306a36Sopenharmony_ci * Check OOM-Killer is already running under our hierarchy. 183762306a36Sopenharmony_ci * If someone is running, return false. 
 */
static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter, *failed = NULL;

	spin_lock(&memcg_oom_lock);

	for_each_mem_cgroup_tree(iter, memcg) {
		if (iter->oom_lock) {
			/*
			 * this subtree of our hierarchy is already locked
			 * so we cannot give a lock.
			 */
			failed = iter;
			mem_cgroup_iter_break(memcg, iter);
			break;
		} else
			iter->oom_lock = true;
	}

	if (failed) {
		/*
		 * OK, we failed to lock the whole subtree so we have
		 * to clean up what we set up to the failing subtree.
		 * Walk the tree again and clear oom_lock on everything
		 * visited before the memcg that already held it.
		 */
		for_each_mem_cgroup_tree(iter, memcg) {
			if (iter == failed) {
				mem_cgroup_iter_break(memcg, iter);
				break;
			}
			iter->oom_lock = false;
		}
	} else
		mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);

	spin_unlock(&memcg_oom_lock);

	return !failed;
}

/* Undo mem_cgroup_oom_trylock(): clear ->oom_lock in the whole subtree. */
static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	spin_lock(&memcg_oom_lock);
	mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
	for_each_mem_cgroup_tree(iter, memcg)
		iter->oom_lock = false;
	spin_unlock(&memcg_oom_lock);
}

/* Bump ->under_oom for every memcg in @memcg's subtree. */
static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	spin_lock(&memcg_oom_lock);
	for_each_mem_cgroup_tree(iter, memcg)
		iter->under_oom++;
	spin_unlock(&memcg_oom_lock);
}

static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	/*
	 * Be careful about under_oom underflows because a child memcg
	 * could have been added after mem_cgroup_mark_under_oom.
	 */
	spin_lock(&memcg_oom_lock);
	for_each_mem_cgroup_tree(iter, memcg)
		if (iter->under_oom > 0)
			iter->under_oom--;
	spin_unlock(&memcg_oom_lock);
}

static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);

/* A waiter on memcg_oom_waitq, tagged with the memcg it waits for. */
struct oom_wait_info {
	struct mem_cgroup *memcg;
	wait_queue_entry_t wait;
};

/*
 * Wait-queue callback: only wake a waiter when the OOMing memcg (@arg)
 * and the waiter's memcg are in the same hierarchy branch (one is a
 * descendant of the other, either way round).
 */
static int memcg_oom_wake_function(wait_queue_entry_t *wait,
	unsigned mode, int sync, void *arg)
{
	struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
	struct mem_cgroup *oom_wait_memcg;
	struct oom_wait_info *oom_wait_info;

	oom_wait_info = container_of(wait, struct oom_wait_info, wait);
	oom_wait_memcg = oom_wait_info->memcg;

	if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
	    !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
		return 0;
	return autoremove_wake_function(wait, mode, sync, arg);
}

static void memcg_oom_recover(struct mem_cgroup *memcg)
{
	/*
	 * For the following lockless ->under_oom test, the only required
	 * guarantee is that it must see the state asserted by an OOM when
	 * this function is called as a result of userland actions
	 * triggered by the notification of the OOM. This is trivially
	 * achieved by invoking mem_cgroup_mark_under_oom() before
	 * triggering notification.
	 */
	if (memcg && memcg->under_oom)
		__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
}

/*
 * Returns true if successfully killed one or more processes. Though in some
 * corner cases it can return true even without killing any process.
 */
static bool mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
{
	bool locked, ret;

	/* Costly allocations can fall back; don't OOM-kill for them. */
	if (order > PAGE_ALLOC_COSTLY_ORDER)
		return false;

	memcg_memory_event(memcg, MEMCG_OOM);

	/*
	 * We are in the middle of the charge context here, so we
	 * don't want to block when potentially sitting on a callstack
	 * that holds all kinds of filesystem and mm locks.
	 *
	 * cgroup1 allows disabling the OOM killer and waiting for outside
	 * handling until the charge can succeed; remember the context and put
	 * the task to sleep at the end of the page fault when all locks are
	 * released.
	 *
	 * On the other hand, in-kernel OOM killer allows for an async victim
	 * memory reclaim (oom_reaper) and that means that we are not solely
	 * relying on the oom victim to make a forward progress and we can
	 * invoke the oom killer here.
	 *
	 * Please note that mem_cgroup_out_of_memory might fail to find a
	 * victim and then we have to bail out from the charge path.
	 */
	if (READ_ONCE(memcg->oom_kill_disable)) {
		if (current->in_user_fault) {
			/* Reference is dropped in mem_cgroup_oom_synchronize(). */
			css_get(&memcg->css);
			current->memcg_in_oom = memcg;
			current->memcg_oom_gfp_mask = mask;
			current->memcg_oom_order = order;
		}
		return false;
	}

	mem_cgroup_mark_under_oom(memcg);

	locked = mem_cgroup_oom_trylock(memcg);

	if (locked)
		mem_cgroup_oom_notify(memcg);

	mem_cgroup_unmark_under_oom(memcg);
	ret = mem_cgroup_out_of_memory(memcg, mask, order);

	if (locked)
		mem_cgroup_oom_unlock(memcg);

	return ret;
}

/**
 * mem_cgroup_oom_synchronize - complete memcg OOM handling
 * @handle: actually kill/wait or
just clean up the OOM state
 *
 * This has to be called at the end of a page fault if the memcg OOM
 * handler was enabled.
 *
 * Memcg supports userspace OOM handling where failed allocations must
 * sleep on a waitqueue until the userspace task resolves the
 * situation. Sleeping directly in the charge context with all kinds
 * of locks held is not a good idea, instead we remember an OOM state
 * in the task and mem_cgroup_oom_synchronize() has to be called at
 * the end of the page fault to complete the OOM handling.
 *
 * Returns %true if an ongoing memcg OOM situation was detected and
 * completed, %false otherwise.
 */
bool mem_cgroup_oom_synchronize(bool handle)
{
	struct mem_cgroup *memcg = current->memcg_in_oom;
	struct oom_wait_info owait;
	bool locked;

	/* OOM is global, do not handle */
	if (!memcg)
		return false;

	if (!handle)
		goto cleanup;

	owait.memcg = memcg;
	owait.wait.flags = 0;
	owait.wait.func = memcg_oom_wake_function;
	owait.wait.private = current;
	INIT_LIST_HEAD(&owait.wait.entry);

	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
	mem_cgroup_mark_under_oom(memcg);

	locked = mem_cgroup_oom_trylock(memcg);

	if (locked)
		mem_cgroup_oom_notify(memcg);

	/* Sleep until woken by memcg_oom_recover() or a signal. */
	schedule();
	mem_cgroup_unmark_under_oom(memcg);
	finish_wait(&memcg_oom_waitq, &owait.wait);

	if (locked)
		mem_cgroup_oom_unlock(memcg);
cleanup:
	current->memcg_in_oom = NULL;
	/* Drops the reference taken when memcg_in_oom was recorded. */
	css_put(&memcg->css);
	return true;
}

/**
 * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM
 * @victim: task to be killed by the OOM killer
 * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM
 *
 * Returns a pointer to a memory cgroup, which has to be cleaned up
 * by killing all belonging OOM-killable tasks.
 *
 * Caller has to call mem_cgroup_put() on the returned non-NULL memcg.
 */
struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
					    struct mem_cgroup *oom_domain)
{
	struct mem_cgroup *oom_group = NULL;
	struct mem_cgroup *memcg;

	/* memory.oom.group only exists on the unified (v2) hierarchy. */
	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return NULL;

	if (!oom_domain)
		oom_domain = root_mem_cgroup;

	rcu_read_lock();

	memcg = mem_cgroup_from_task(victim);
	if (mem_cgroup_is_root(memcg))
		goto out;

	/*
	 * If the victim task has been asynchronously moved to a different
	 * memory cgroup, we might end up killing tasks outside oom_domain.
	 * In this case it's better to ignore memory.group.oom.
	 */
	if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain)))
		goto out;

	/*
	 * Traverse the memory cgroup hierarchy from the victim task's
	 * cgroup up to the OOMing cgroup (or root) to find the
	 * highest-level memory cgroup with oom.group set.
	 */
	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
		if (READ_ONCE(memcg->oom_group))
			oom_group = memcg;

		if (memcg == oom_domain)
			break;
	}

	if (oom_group)
		css_get(&oom_group->css);
out:
	rcu_read_unlock();

	return oom_group;
}

/* Log which cgroup's tasks are about to be group-OOM-killed. */
void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
{
	pr_info("Tasks in ");
	pr_cont_cgroup_path(memcg->css.cgroup);
	pr_cont(" are going to be killed due to memory.oom.group set\n");
}

/**
 * folio_memcg_lock - Bind a folio to its memcg.
 * @folio: The folio.
 *
 * This function prevents unlocked LRU folios from being moved to
 * another cgroup.
 *
 * It ensures lifetime of the bound memcg. The caller is responsible
 * for the lifetime of the folio.
 */
void folio_memcg_lock(struct folio *folio)
{
	struct mem_cgroup *memcg;
	unsigned long flags;

	/*
	 * The RCU lock is held throughout the transaction. The fast
	 * path can get away without acquiring the memcg->move_lock
	 * because page moving starts with an RCU grace period.
	 *
	 * NOTE: all early returns below deliberately keep the RCU read
	 * lock held; it is released in __folio_memcg_unlock().
	 */
	rcu_read_lock();

	if (mem_cgroup_disabled())
		return;
again:
	memcg = folio_memcg(folio);
	if (unlikely(!memcg))
		return;

#ifdef CONFIG_PROVE_LOCKING
	local_irq_save(flags);
	might_lock(&memcg->move_lock);
	local_irq_restore(flags);
#endif

	/* Fast path: nobody is migrating charges, RCU alone suffices. */
	if (atomic_read(&memcg->moving_account) <= 0)
		return;

	spin_lock_irqsave(&memcg->move_lock, flags);
	/* The folio may have been moved while we waited; recheck. */
	if (memcg != folio_memcg(folio)) {
		spin_unlock_irqrestore(&memcg->move_lock, flags);
		goto again;
	}

	/*
	 * When charge migration first begins, we can have multiple
	 * critical sections holding the fast-path RCU lock and one
	 * holding the slowpath move_lock. Track the task who has the
	 * move_lock for folio_memcg_unlock().
	 */
	memcg->move_lock_task = current;
	memcg->move_lock_flags = flags;
}

static void __folio_memcg_unlock(struct mem_cgroup *memcg)
{
	/* Only the move_lock owner recorded by folio_memcg_lock() unlocks. */
	if (memcg && memcg->move_lock_task == current) {
		unsigned long flags = memcg->move_lock_flags;

		memcg->move_lock_task = NULL;
		memcg->move_lock_flags = 0;

		spin_unlock_irqrestore(&memcg->move_lock, flags);
	}

	rcu_read_unlock();
}

/**
 * folio_memcg_unlock - Release the binding between a folio and its memcg.
 * @folio: The folio.
 *
 * This releases the binding created by folio_memcg_lock(). This does
 * not change the accounting of this folio to its memcg, but it does
 * permit others to change it.
 */
void folio_memcg_unlock(struct folio *folio)
{
	__folio_memcg_unlock(folio_memcg(folio));
}

/* Per-CPU cache of pre-charged pages (and, with kmem, object bytes). */
struct memcg_stock_pcp {
	local_lock_t stock_lock;
	struct mem_cgroup *cached; /* must never be the root cgroup */
	unsigned int nr_pages;

#ifdef CONFIG_MEMCG_KMEM
	struct obj_cgroup *cached_objcg;
	struct pglist_data *cached_pgdat;
	unsigned int nr_bytes;
	int nr_slab_reclaimable_b;
	int nr_slab_unreclaimable_b;
#endif

	struct work_struct work;
	unsigned long flags;
#define FLUSHING_CACHED_CHARGE	0
};
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = {
	.stock_lock = INIT_LOCAL_LOCK(stock_lock),
};
/* Serializes system-wide drains triggered by drain_all_stock(). */
static DEFINE_MUTEX(percpu_charge_mutex);

#ifdef CONFIG_MEMCG_KMEM
static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock);
static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
				     struct mem_cgroup *root_memcg);
static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages);

#else
/* !CONFIG_MEMCG_KMEM stubs: no object stock exists, so nothing to drain. */
static inline struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
{
	return NULL;
}
static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
				     struct mem_cgroup *root_memcg)
{
	return false;
}
static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages)
{
}
#endif

/**
 * consume_stock: Try to consume stocked charge on this cpu.
 * @memcg: memcg to consume from.
 * @nr_pages: how many pages to charge.
 *
 * The charges will only happen if @memcg matches the current cpu's memcg
 * stock, and at least @nr_pages are available in that stock. Failure to
 * service an allocation will refill the stock.
 *
 * returns true if successful, false otherwise.
 */
static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
	struct memcg_stock_pcp *stock;
	unsigned long flags;
	bool ret = false;

	/* Requests larger than one batch bypass the stock entirely. */
	if (nr_pages > MEMCG_CHARGE_BATCH)
		return ret;

	local_lock_irqsave(&memcg_stock.stock_lock, flags);

	stock = this_cpu_ptr(&memcg_stock);
	if (memcg == READ_ONCE(stock->cached) && stock->nr_pages >= nr_pages) {
		stock->nr_pages -= nr_pages;
		ret = true;
	}

	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);

	return ret;
}

/*
 * Return the stock cached in percpu to the page counters and reset the
 * cached information.
 */
static void drain_stock(struct memcg_stock_pcp *stock)
{
	struct mem_cgroup *old = READ_ONCE(stock->cached);

	if (!old)
		return;

	if (stock->nr_pages) {
		page_counter_uncharge(&old->memory, stock->nr_pages);
		if (do_memsw_account())
			page_counter_uncharge(&old->memsw, stock->nr_pages);
		stock->nr_pages = 0;
	}

	/* Drops the reference taken when the memcg was cached. */
	css_put(&old->css);
	WRITE_ONCE(stock->cached, NULL);
}

/* Work callback: drain this CPU's page and object stocks. */
static void drain_local_stock(struct work_struct *dummy)
{
	struct memcg_stock_pcp *stock;
	struct obj_cgroup *old = NULL;
	unsigned long flags;

	/*
	 * The only protection from cpu hotplug (memcg_hotplug_cpu_dead) vs.
	 * drain_stock races is that we always operate on local CPU stock
	 * here with IRQ disabled
	 */
	local_lock_irqsave(&memcg_stock.stock_lock, flags);

	stock = this_cpu_ptr(&memcg_stock);
	old = drain_obj_stock(stock);
	drain_stock(stock);
	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);

	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
	/* Put the objcg reference outside the IRQ-disabled section. */
	if (old)
		obj_cgroup_put(old);
}

/*
 * Cache charges(@nr_pages) to local per_cpu area.
 * This will be consumed by consume_stock() function, later.
 * Caller must hold memcg_stock.stock_lock (IRQs disabled).
 */
static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
	struct memcg_stock_pcp *stock;

	stock = this_cpu_ptr(&memcg_stock);
	if (READ_ONCE(stock->cached) != memcg) { /* reset if necessary */
		drain_stock(stock);
		css_get(&memcg->css);
		WRITE_ONCE(stock->cached, memcg);
	}
	stock->nr_pages += nr_pages;

	/* Cap the stock so a single CPU cannot hoard too many charges. */
	if (stock->nr_pages > MEMCG_CHARGE_BATCH)
		drain_stock(stock);
}

/* IRQ-safe wrapper around __refill_stock(). */
static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
	unsigned long flags;

	local_lock_irqsave(&memcg_stock.stock_lock, flags);
	__refill_stock(memcg, nr_pages);
	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
}

/*
 * Drains all per-CPU charge caches for given root_memcg resp. subtree
 * of the hierarchy under it.
 */
static void drain_all_stock(struct mem_cgroup *root_memcg)
{
	int cpu, curcpu;

	/* If someone's already draining, avoid running more workers. */
	if (!mutex_trylock(&percpu_charge_mutex))
		return;
	/*
	 * Notify other cpus that system-wide "drain" is running
	 * We do not care about races with the cpu hotplug because cpu down
	 * as well as workers from this path always operate on the local
	 * per-cpu data. CPU up doesn't touch memcg_stock at all.
	 */
	migrate_disable();
	curcpu = smp_processor_id();
	for_each_online_cpu(cpu) {
		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
		struct mem_cgroup *memcg;
		bool flush = false;

		rcu_read_lock();
		memcg = READ_ONCE(stock->cached);
		if (memcg && stock->nr_pages &&
		    mem_cgroup_is_descendant(memcg, root_memcg))
			flush = true;
		else if (obj_stock_flush_required(stock, root_memcg))
			flush = true;
		rcu_read_unlock();

		if (flush &&
		    !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
			/* Local CPU drains inline; remote CPUs via work items. */
			if (cpu == curcpu)
				drain_local_stock(&stock->work);
			else if (!cpu_is_isolated(cpu))
				schedule_work_on(cpu, &stock->work);
		}
	}
	migrate_enable();
	mutex_unlock(&percpu_charge_mutex);
}

/* CPU hotplug callback: return the dead CPU's stocked pages. */
static int memcg_hotplug_cpu_dead(unsigned int cpu)
{
	struct memcg_stock_pcp *stock;

	stock = &per_cpu(memcg_stock, cpu);
	drain_stock(stock);

	return 0;
}

static unsigned long reclaim_high(struct mem_cgroup *memcg,
				  unsigned int nr_pages,
				  gfp_t gfp_mask)
{
	unsigned long nr_reclaimed = 0;

	/* Walk up the hierarchy, reclaiming from every ancestor over high. */
	do {
		unsigned long pflags;

		if (page_counter_read(&memcg->memory) <=
		    READ_ONCE(memcg->memory.high))
			continue;

		memcg_memory_event(memcg, MEMCG_HIGH);

		psi_memstall_enter(&pflags);
		nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
							gfp_mask,
							MEMCG_RECLAIM_MAY_SWAP);
		psi_memstall_leave(&pflags);
	} while ((memcg = parent_mem_cgroup(memcg)) &&
		 !mem_cgroup_is_root(memcg));

	return nr_reclaimed;
}

/* Work callback: reclaim on behalf of the memcg that queued high_work. */
static void high_work_func(struct work_struct *work)
{
	struct mem_cgroup *memcg;

	memcg = container_of(work, struct mem_cgroup, high_work);
	reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
}

/*
 * Clamp the maximum sleep time per allocation batch to 2 seconds. This is
 * enough to still cause a significant slowdown in most cases, while still
 * allowing diagnostics and tracing to proceed without becoming stuck.
 */
#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)

/*
 * When calculating the delay, we use these either side of the exponentiation to
 * maintain precision and scale to a reasonable number of jiffies (see the table
 * below).
 *
 * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
 *   overage ratio to a delay.
 * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
 *   proposed penalty in order to reduce to a reasonable number of jiffies, and
 *   to produce a reasonable delay curve.
 *
 * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
 * reasonable delay curve compared to precision-adjusted overage, not
 * penalising heavily at first, but still making sure that growth beyond the
 * limit penalises misbehaving cgroups by slowing them down exponentially. For
 * example, with a high of 100 megabytes:
 *
 *  +-------+------------------------+
 *  | usage | time to allocate in ms |
 *  +-------+------------------------+
 *  | 100M  |                      0 |
 *  | 101M  |                      6 |
 *  | 102M  |                     25 |
 *  | 103M  |                     57 |
 *  | 104M  |                    102 |
 *  | 105M  |                    159 |
 *  | 106M  |                    230 |
 *  | 107M  |                    313 |
 *  | 108M  |                    409 |
 *  | 109M  |                    518 |
 *  | 110M  |                    639 |
 *  | 111M  |                    774 |
 *  | 112M  |                    921 |
 *  | 113M  |                   1081 |
 *  | 114M  |                   1254 |
 *  | 115M  |                   1439 |
 *  | 116M  |                   1638 |
 *  | 117M  |                   1849 |
 *  | 118M  |                   2000 |
 *  | 119M  |                   2000 |
 *  | 120M  |                   2000 |
 *  +-------+------------------------+
 */
#define MEMCG_DELAY_PRECISION_SHIFT 20
#define MEMCG_DELAY_SCALING_SHIFT 14

/*
 * Return the fixed-point overage ratio (usage - high) / high, scaled by
 * 2^MEMCG_DELAY_PRECISION_SHIFT, or 0 when usage is within the limit.
 */
static u64 calculate_overage(unsigned long usage, unsigned long high)
{
	u64 overage;

	if (usage <= high)
		return 0;

	/*
	 * Prevent division by 0 in overage calculation by acting as if
	 * it was a threshold of 1 page
	 */
	high = max(high, 1UL);

	overage = usage - high;
	overage <<= MEMCG_DELAY_PRECISION_SHIFT;
	return div64_u64(overage, high);
}

/* Worst memory.high overage ratio of @memcg and all its ancestors. */
static u64 mem_find_max_overage(struct mem_cgroup *memcg)
{
	u64 overage, max_overage = 0;

	do {
		overage = calculate_overage(page_counter_read(&memcg->memory),
					    READ_ONCE(memcg->memory.high));
		max_overage = max(overage, max_overage);
	} while ((memcg = parent_mem_cgroup(memcg)) &&
		 !mem_cgroup_is_root(memcg));

	return max_overage;
}

/* Worst swap.high overage ratio of @memcg and all its ancestors. */
static u64 swap_find_max_overage(struct mem_cgroup *memcg)
{
	u64 overage, max_overage = 0;

	do {
		overage = calculate_overage(page_counter_read(&memcg->swap),
					    READ_ONCE(memcg->swap.high));
		if (overage)
			memcg_memory_event(memcg, MEMCG_SWAP_HIGH);
		max_overage = max(overage, max_overage);
	} while ((memcg = parent_mem_cgroup(memcg)) &&
		 !mem_cgroup_is_root(memcg));

	return max_overage;
}

/*
 * Get the number of jiffies that we should
penalise a mischievous cgroup which 255062306a36Sopenharmony_ci * is exceeding its memory.high by checking both it and its ancestors. 255162306a36Sopenharmony_ci */ 255262306a36Sopenharmony_cistatic unsigned long calculate_high_delay(struct mem_cgroup *memcg, 255362306a36Sopenharmony_ci unsigned int nr_pages, 255462306a36Sopenharmony_ci u64 max_overage) 255562306a36Sopenharmony_ci{ 255662306a36Sopenharmony_ci unsigned long penalty_jiffies; 255762306a36Sopenharmony_ci 255862306a36Sopenharmony_ci if (!max_overage) 255962306a36Sopenharmony_ci return 0; 256062306a36Sopenharmony_ci 256162306a36Sopenharmony_ci /* 256262306a36Sopenharmony_ci * We use overage compared to memory.high to calculate the number of 256362306a36Sopenharmony_ci * jiffies to sleep (penalty_jiffies). Ideally this value should be 256462306a36Sopenharmony_ci * fairly lenient on small overages, and increasingly harsh when the 256562306a36Sopenharmony_ci * memcg in question makes it clear that it has no intention of stopping 256662306a36Sopenharmony_ci * its crazy behaviour, so we exponentially increase the delay based on 256762306a36Sopenharmony_ci * overage amount. 256862306a36Sopenharmony_ci */ 256962306a36Sopenharmony_ci penalty_jiffies = max_overage * max_overage * HZ; 257062306a36Sopenharmony_ci penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT; 257162306a36Sopenharmony_ci penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT; 257262306a36Sopenharmony_ci 257362306a36Sopenharmony_ci /* 257462306a36Sopenharmony_ci * Factor in the task's own contribution to the overage, such that four 257562306a36Sopenharmony_ci * N-sized allocations are throttled approximately the same as one 257662306a36Sopenharmony_ci * 4N-sized allocation. 257762306a36Sopenharmony_ci * 257862306a36Sopenharmony_ci * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or 257962306a36Sopenharmony_ci * larger the current charge patch is than that. 
258062306a36Sopenharmony_ci */ 258162306a36Sopenharmony_ci return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH; 258262306a36Sopenharmony_ci} 258362306a36Sopenharmony_ci 258462306a36Sopenharmony_ci/* 258562306a36Sopenharmony_ci * Scheduled by try_charge() to be executed from the userland return path 258662306a36Sopenharmony_ci * and reclaims memory over the high limit. 258762306a36Sopenharmony_ci */ 258862306a36Sopenharmony_civoid mem_cgroup_handle_over_high(gfp_t gfp_mask) 258962306a36Sopenharmony_ci{ 259062306a36Sopenharmony_ci unsigned long penalty_jiffies; 259162306a36Sopenharmony_ci unsigned long pflags; 259262306a36Sopenharmony_ci unsigned long nr_reclaimed; 259362306a36Sopenharmony_ci unsigned int nr_pages = current->memcg_nr_pages_over_high; 259462306a36Sopenharmony_ci int nr_retries = MAX_RECLAIM_RETRIES; 259562306a36Sopenharmony_ci struct mem_cgroup *memcg; 259662306a36Sopenharmony_ci bool in_retry = false; 259762306a36Sopenharmony_ci 259862306a36Sopenharmony_ci if (likely(!nr_pages)) 259962306a36Sopenharmony_ci return; 260062306a36Sopenharmony_ci 260162306a36Sopenharmony_ci memcg = get_mem_cgroup_from_mm(current->mm); 260262306a36Sopenharmony_ci current->memcg_nr_pages_over_high = 0; 260362306a36Sopenharmony_ci 260462306a36Sopenharmony_ciretry_reclaim: 260562306a36Sopenharmony_ci /* 260662306a36Sopenharmony_ci * The allocating task should reclaim at least the batch size, but for 260762306a36Sopenharmony_ci * subsequent retries we only want to do what's necessary to prevent oom 260862306a36Sopenharmony_ci * or breaching resource isolation. 260962306a36Sopenharmony_ci * 261062306a36Sopenharmony_ci * This is distinct from memory.max or page allocator behaviour because 261162306a36Sopenharmony_ci * memory.high is currently batched, whereas memory.max and the page 261262306a36Sopenharmony_ci * allocator run every time an allocation is made. 
261362306a36Sopenharmony_ci */ 261462306a36Sopenharmony_ci nr_reclaimed = reclaim_high(memcg, 261562306a36Sopenharmony_ci in_retry ? SWAP_CLUSTER_MAX : nr_pages, 261662306a36Sopenharmony_ci gfp_mask); 261762306a36Sopenharmony_ci 261862306a36Sopenharmony_ci /* 261962306a36Sopenharmony_ci * memory.high is breached and reclaim is unable to keep up. Throttle 262062306a36Sopenharmony_ci * allocators proactively to slow down excessive growth. 262162306a36Sopenharmony_ci */ 262262306a36Sopenharmony_ci penalty_jiffies = calculate_high_delay(memcg, nr_pages, 262362306a36Sopenharmony_ci mem_find_max_overage(memcg)); 262462306a36Sopenharmony_ci 262562306a36Sopenharmony_ci penalty_jiffies += calculate_high_delay(memcg, nr_pages, 262662306a36Sopenharmony_ci swap_find_max_overage(memcg)); 262762306a36Sopenharmony_ci 262862306a36Sopenharmony_ci /* 262962306a36Sopenharmony_ci * Clamp the max delay per usermode return so as to still keep the 263062306a36Sopenharmony_ci * application moving forwards and also permit diagnostics, albeit 263162306a36Sopenharmony_ci * extremely slowly. 263262306a36Sopenharmony_ci */ 263362306a36Sopenharmony_ci penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES); 263462306a36Sopenharmony_ci 263562306a36Sopenharmony_ci /* 263662306a36Sopenharmony_ci * Don't sleep if the amount of jiffies this memcg owes us is so low 263762306a36Sopenharmony_ci * that it's not even worth doing, in an attempt to be nice to those who 263862306a36Sopenharmony_ci * go only a small amount over their memory.high value and maybe haven't 263962306a36Sopenharmony_ci * been aggressively reclaimed enough yet. 
264062306a36Sopenharmony_ci */ 264162306a36Sopenharmony_ci if (penalty_jiffies <= HZ / 100) 264262306a36Sopenharmony_ci goto out; 264362306a36Sopenharmony_ci 264462306a36Sopenharmony_ci /* 264562306a36Sopenharmony_ci * If reclaim is making forward progress but we're still over 264662306a36Sopenharmony_ci * memory.high, we want to encourage that rather than doing allocator 264762306a36Sopenharmony_ci * throttling. 264862306a36Sopenharmony_ci */ 264962306a36Sopenharmony_ci if (nr_reclaimed || nr_retries--) { 265062306a36Sopenharmony_ci in_retry = true; 265162306a36Sopenharmony_ci goto retry_reclaim; 265262306a36Sopenharmony_ci } 265362306a36Sopenharmony_ci 265462306a36Sopenharmony_ci /* 265562306a36Sopenharmony_ci * If we exit early, we're guaranteed to die (since 265662306a36Sopenharmony_ci * schedule_timeout_killable sets TASK_KILLABLE). This means we don't 265762306a36Sopenharmony_ci * need to account for any ill-begotten jiffies to pay them off later. 265862306a36Sopenharmony_ci */ 265962306a36Sopenharmony_ci psi_memstall_enter(&pflags); 266062306a36Sopenharmony_ci schedule_timeout_killable(penalty_jiffies); 266162306a36Sopenharmony_ci psi_memstall_leave(&pflags); 266262306a36Sopenharmony_ci 266362306a36Sopenharmony_ciout: 266462306a36Sopenharmony_ci css_put(&memcg->css); 266562306a36Sopenharmony_ci} 266662306a36Sopenharmony_ci 266762306a36Sopenharmony_cistatic int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, 266862306a36Sopenharmony_ci unsigned int nr_pages) 266962306a36Sopenharmony_ci{ 267062306a36Sopenharmony_ci unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages); 267162306a36Sopenharmony_ci int nr_retries = MAX_RECLAIM_RETRIES; 267262306a36Sopenharmony_ci struct mem_cgroup *mem_over_limit; 267362306a36Sopenharmony_ci struct page_counter *counter; 267462306a36Sopenharmony_ci unsigned long nr_reclaimed; 267562306a36Sopenharmony_ci bool passed_oom = false; 267662306a36Sopenharmony_ci unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP; 
267762306a36Sopenharmony_ci bool drained = false; 267862306a36Sopenharmony_ci bool raised_max_event = false; 267962306a36Sopenharmony_ci unsigned long pflags; 268062306a36Sopenharmony_ci 268162306a36Sopenharmony_ciretry: 268262306a36Sopenharmony_ci if (consume_stock(memcg, nr_pages)) 268362306a36Sopenharmony_ci return 0; 268462306a36Sopenharmony_ci 268562306a36Sopenharmony_ci if (!do_memsw_account() || 268662306a36Sopenharmony_ci page_counter_try_charge(&memcg->memsw, batch, &counter)) { 268762306a36Sopenharmony_ci if (page_counter_try_charge(&memcg->memory, batch, &counter)) 268862306a36Sopenharmony_ci goto done_restock; 268962306a36Sopenharmony_ci if (do_memsw_account()) 269062306a36Sopenharmony_ci page_counter_uncharge(&memcg->memsw, batch); 269162306a36Sopenharmony_ci mem_over_limit = mem_cgroup_from_counter(counter, memory); 269262306a36Sopenharmony_ci } else { 269362306a36Sopenharmony_ci mem_over_limit = mem_cgroup_from_counter(counter, memsw); 269462306a36Sopenharmony_ci reclaim_options &= ~MEMCG_RECLAIM_MAY_SWAP; 269562306a36Sopenharmony_ci } 269662306a36Sopenharmony_ci 269762306a36Sopenharmony_ci if (batch > nr_pages) { 269862306a36Sopenharmony_ci batch = nr_pages; 269962306a36Sopenharmony_ci goto retry; 270062306a36Sopenharmony_ci } 270162306a36Sopenharmony_ci 270262306a36Sopenharmony_ci /* 270362306a36Sopenharmony_ci * Prevent unbounded recursion when reclaim operations need to 270462306a36Sopenharmony_ci * allocate memory. This might exceed the limits temporarily, 270562306a36Sopenharmony_ci * but we prefer facilitating memory reclaim and getting back 270662306a36Sopenharmony_ci * under the limit over triggering OOM kills in these cases. 
270762306a36Sopenharmony_ci */ 270862306a36Sopenharmony_ci if (unlikely(current->flags & PF_MEMALLOC)) 270962306a36Sopenharmony_ci goto force; 271062306a36Sopenharmony_ci 271162306a36Sopenharmony_ci if (unlikely(task_in_memcg_oom(current))) 271262306a36Sopenharmony_ci goto nomem; 271362306a36Sopenharmony_ci 271462306a36Sopenharmony_ci if (!gfpflags_allow_blocking(gfp_mask)) 271562306a36Sopenharmony_ci goto nomem; 271662306a36Sopenharmony_ci 271762306a36Sopenharmony_ci memcg_memory_event(mem_over_limit, MEMCG_MAX); 271862306a36Sopenharmony_ci raised_max_event = true; 271962306a36Sopenharmony_ci 272062306a36Sopenharmony_ci psi_memstall_enter(&pflags); 272162306a36Sopenharmony_ci nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, 272262306a36Sopenharmony_ci gfp_mask, reclaim_options); 272362306a36Sopenharmony_ci psi_memstall_leave(&pflags); 272462306a36Sopenharmony_ci 272562306a36Sopenharmony_ci if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 272662306a36Sopenharmony_ci goto retry; 272762306a36Sopenharmony_ci 272862306a36Sopenharmony_ci if (!drained) { 272962306a36Sopenharmony_ci drain_all_stock(mem_over_limit); 273062306a36Sopenharmony_ci drained = true; 273162306a36Sopenharmony_ci goto retry; 273262306a36Sopenharmony_ci } 273362306a36Sopenharmony_ci 273462306a36Sopenharmony_ci if (gfp_mask & __GFP_NORETRY) 273562306a36Sopenharmony_ci goto nomem; 273662306a36Sopenharmony_ci /* 273762306a36Sopenharmony_ci * Even though the limit is exceeded at this point, reclaim 273862306a36Sopenharmony_ci * may have been able to free some pages. Retry the charge 273962306a36Sopenharmony_ci * before killing the task. 274062306a36Sopenharmony_ci * 274162306a36Sopenharmony_ci * Only for regular pages, though: huge pages are rather 274262306a36Sopenharmony_ci * unlikely to succeed so close to the limit, and we fall back 274362306a36Sopenharmony_ci * to regular pages anyway in case of failure. 
274462306a36Sopenharmony_ci */ 274562306a36Sopenharmony_ci if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER)) 274662306a36Sopenharmony_ci goto retry; 274762306a36Sopenharmony_ci /* 274862306a36Sopenharmony_ci * At task move, charge accounts can be doubly counted. So, it's 274962306a36Sopenharmony_ci * better to wait until the end of task_move if something is going on. 275062306a36Sopenharmony_ci */ 275162306a36Sopenharmony_ci if (mem_cgroup_wait_acct_move(mem_over_limit)) 275262306a36Sopenharmony_ci goto retry; 275362306a36Sopenharmony_ci 275462306a36Sopenharmony_ci if (nr_retries--) 275562306a36Sopenharmony_ci goto retry; 275662306a36Sopenharmony_ci 275762306a36Sopenharmony_ci if (gfp_mask & __GFP_RETRY_MAYFAIL) 275862306a36Sopenharmony_ci goto nomem; 275962306a36Sopenharmony_ci 276062306a36Sopenharmony_ci /* Avoid endless loop for tasks bypassed by the oom killer */ 276162306a36Sopenharmony_ci if (passed_oom && task_is_dying()) 276262306a36Sopenharmony_ci goto nomem; 276362306a36Sopenharmony_ci 276462306a36Sopenharmony_ci /* 276562306a36Sopenharmony_ci * keep retrying as long as the memcg oom killer is able to make 276662306a36Sopenharmony_ci * a forward progress or bypass the charge if the oom killer 276762306a36Sopenharmony_ci * couldn't make any progress. 276862306a36Sopenharmony_ci */ 276962306a36Sopenharmony_ci if (mem_cgroup_oom(mem_over_limit, gfp_mask, 277062306a36Sopenharmony_ci get_order(nr_pages * PAGE_SIZE))) { 277162306a36Sopenharmony_ci passed_oom = true; 277262306a36Sopenharmony_ci nr_retries = MAX_RECLAIM_RETRIES; 277362306a36Sopenharmony_ci goto retry; 277462306a36Sopenharmony_ci } 277562306a36Sopenharmony_cinomem: 277662306a36Sopenharmony_ci /* 277762306a36Sopenharmony_ci * Memcg doesn't have a dedicated reserve for atomic 277862306a36Sopenharmony_ci * allocations. 
But like the global atomic pool, we need to 277962306a36Sopenharmony_ci * put the burden of reclaim on regular allocation requests 278062306a36Sopenharmony_ci * and let these go through as privileged allocations. 278162306a36Sopenharmony_ci */ 278262306a36Sopenharmony_ci if (!(gfp_mask & (__GFP_NOFAIL | __GFP_HIGH))) 278362306a36Sopenharmony_ci return -ENOMEM; 278462306a36Sopenharmony_ciforce: 278562306a36Sopenharmony_ci /* 278662306a36Sopenharmony_ci * If the allocation has to be enforced, don't forget to raise 278762306a36Sopenharmony_ci * a MEMCG_MAX event. 278862306a36Sopenharmony_ci */ 278962306a36Sopenharmony_ci if (!raised_max_event) 279062306a36Sopenharmony_ci memcg_memory_event(mem_over_limit, MEMCG_MAX); 279162306a36Sopenharmony_ci 279262306a36Sopenharmony_ci /* 279362306a36Sopenharmony_ci * The allocation either can't fail or will lead to more memory 279462306a36Sopenharmony_ci * being freed very soon. Allow memory usage go over the limit 279562306a36Sopenharmony_ci * temporarily by force charging it. 279662306a36Sopenharmony_ci */ 279762306a36Sopenharmony_ci page_counter_charge(&memcg->memory, nr_pages); 279862306a36Sopenharmony_ci if (do_memsw_account()) 279962306a36Sopenharmony_ci page_counter_charge(&memcg->memsw, nr_pages); 280062306a36Sopenharmony_ci 280162306a36Sopenharmony_ci return 0; 280262306a36Sopenharmony_ci 280362306a36Sopenharmony_cidone_restock: 280462306a36Sopenharmony_ci if (batch > nr_pages) 280562306a36Sopenharmony_ci refill_stock(memcg, batch - nr_pages); 280662306a36Sopenharmony_ci 280762306a36Sopenharmony_ci /* 280862306a36Sopenharmony_ci * If the hierarchy is above the normal consumption range, schedule 280962306a36Sopenharmony_ci * reclaim on returning to userland. We can perform reclaim here 281062306a36Sopenharmony_ci * if __GFP_RECLAIM but let's always punt for simplicity and so that 281162306a36Sopenharmony_ci * GFP_KERNEL can consistently be used during reclaim. 
@memcg is 281262306a36Sopenharmony_ci * not recorded as it most likely matches current's and won't 281362306a36Sopenharmony_ci * change in the meantime. As high limit is checked again before 281462306a36Sopenharmony_ci * reclaim, the cost of mismatch is negligible. 281562306a36Sopenharmony_ci */ 281662306a36Sopenharmony_ci do { 281762306a36Sopenharmony_ci bool mem_high, swap_high; 281862306a36Sopenharmony_ci 281962306a36Sopenharmony_ci mem_high = page_counter_read(&memcg->memory) > 282062306a36Sopenharmony_ci READ_ONCE(memcg->memory.high); 282162306a36Sopenharmony_ci swap_high = page_counter_read(&memcg->swap) > 282262306a36Sopenharmony_ci READ_ONCE(memcg->swap.high); 282362306a36Sopenharmony_ci 282462306a36Sopenharmony_ci /* Don't bother a random interrupted task */ 282562306a36Sopenharmony_ci if (!in_task()) { 282662306a36Sopenharmony_ci if (mem_high) { 282762306a36Sopenharmony_ci schedule_work(&memcg->high_work); 282862306a36Sopenharmony_ci break; 282962306a36Sopenharmony_ci } 283062306a36Sopenharmony_ci continue; 283162306a36Sopenharmony_ci } 283262306a36Sopenharmony_ci 283362306a36Sopenharmony_ci if (mem_high || swap_high) { 283462306a36Sopenharmony_ci /* 283562306a36Sopenharmony_ci * The allocating tasks in this cgroup will need to do 283662306a36Sopenharmony_ci * reclaim or be throttled to prevent further growth 283762306a36Sopenharmony_ci * of the memory or swap footprints. 283862306a36Sopenharmony_ci * 283962306a36Sopenharmony_ci * Target some best-effort fairness between the tasks, 284062306a36Sopenharmony_ci * and distribute reclaim work and delay penalties 284162306a36Sopenharmony_ci * based on how much each task is actually allocating. 
284262306a36Sopenharmony_ci */ 284362306a36Sopenharmony_ci current->memcg_nr_pages_over_high += batch; 284462306a36Sopenharmony_ci set_notify_resume(current); 284562306a36Sopenharmony_ci break; 284662306a36Sopenharmony_ci } 284762306a36Sopenharmony_ci } while ((memcg = parent_mem_cgroup(memcg))); 284862306a36Sopenharmony_ci 284962306a36Sopenharmony_ci if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH && 285062306a36Sopenharmony_ci !(current->flags & PF_MEMALLOC) && 285162306a36Sopenharmony_ci gfpflags_allow_blocking(gfp_mask)) { 285262306a36Sopenharmony_ci mem_cgroup_handle_over_high(gfp_mask); 285362306a36Sopenharmony_ci } 285462306a36Sopenharmony_ci return 0; 285562306a36Sopenharmony_ci} 285662306a36Sopenharmony_ci 285762306a36Sopenharmony_cistatic inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, 285862306a36Sopenharmony_ci unsigned int nr_pages) 285962306a36Sopenharmony_ci{ 286062306a36Sopenharmony_ci if (mem_cgroup_is_root(memcg)) 286162306a36Sopenharmony_ci return 0; 286262306a36Sopenharmony_ci 286362306a36Sopenharmony_ci return try_charge_memcg(memcg, gfp_mask, nr_pages); 286462306a36Sopenharmony_ci} 286562306a36Sopenharmony_ci 286662306a36Sopenharmony_cistatic inline void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) 286762306a36Sopenharmony_ci{ 286862306a36Sopenharmony_ci if (mem_cgroup_is_root(memcg)) 286962306a36Sopenharmony_ci return; 287062306a36Sopenharmony_ci 287162306a36Sopenharmony_ci page_counter_uncharge(&memcg->memory, nr_pages); 287262306a36Sopenharmony_ci if (do_memsw_account()) 287362306a36Sopenharmony_ci page_counter_uncharge(&memcg->memsw, nr_pages); 287462306a36Sopenharmony_ci} 287562306a36Sopenharmony_ci 287662306a36Sopenharmony_cistatic void commit_charge(struct folio *folio, struct mem_cgroup *memcg) 287762306a36Sopenharmony_ci{ 287862306a36Sopenharmony_ci VM_BUG_ON_FOLIO(folio_memcg(folio), folio); 287962306a36Sopenharmony_ci /* 288062306a36Sopenharmony_ci * Any of the following ensures page's 
memcg stability: 288162306a36Sopenharmony_ci * 288262306a36Sopenharmony_ci * - the page lock 288362306a36Sopenharmony_ci * - LRU isolation 288462306a36Sopenharmony_ci * - folio_memcg_lock() 288562306a36Sopenharmony_ci * - exclusive reference 288662306a36Sopenharmony_ci * - mem_cgroup_trylock_pages() 288762306a36Sopenharmony_ci */ 288862306a36Sopenharmony_ci folio->memcg_data = (unsigned long)memcg; 288962306a36Sopenharmony_ci} 289062306a36Sopenharmony_ci 289162306a36Sopenharmony_ci#ifdef CONFIG_MEMCG_KMEM 289262306a36Sopenharmony_ci/* 289362306a36Sopenharmony_ci * The allocated objcg pointers array is not accounted directly. 289462306a36Sopenharmony_ci * Moreover, it should not come from DMA buffer and is not readily 289562306a36Sopenharmony_ci * reclaimable. So those GFP bits should be masked off. 289662306a36Sopenharmony_ci */ 289762306a36Sopenharmony_ci#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | \ 289862306a36Sopenharmony_ci __GFP_ACCOUNT | __GFP_NOFAIL) 289962306a36Sopenharmony_ci 290062306a36Sopenharmony_ci/* 290162306a36Sopenharmony_ci * mod_objcg_mlstate() may be called with irq enabled, so 290262306a36Sopenharmony_ci * mod_memcg_lruvec_state() should be used. 
290362306a36Sopenharmony_ci */ 290462306a36Sopenharmony_cistatic inline void mod_objcg_mlstate(struct obj_cgroup *objcg, 290562306a36Sopenharmony_ci struct pglist_data *pgdat, 290662306a36Sopenharmony_ci enum node_stat_item idx, int nr) 290762306a36Sopenharmony_ci{ 290862306a36Sopenharmony_ci struct mem_cgroup *memcg; 290962306a36Sopenharmony_ci struct lruvec *lruvec; 291062306a36Sopenharmony_ci 291162306a36Sopenharmony_ci rcu_read_lock(); 291262306a36Sopenharmony_ci memcg = obj_cgroup_memcg(objcg); 291362306a36Sopenharmony_ci lruvec = mem_cgroup_lruvec(memcg, pgdat); 291462306a36Sopenharmony_ci mod_memcg_lruvec_state(lruvec, idx, nr); 291562306a36Sopenharmony_ci rcu_read_unlock(); 291662306a36Sopenharmony_ci} 291762306a36Sopenharmony_ci 291862306a36Sopenharmony_ciint memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s, 291962306a36Sopenharmony_ci gfp_t gfp, bool new_slab) 292062306a36Sopenharmony_ci{ 292162306a36Sopenharmony_ci unsigned int objects = objs_per_slab(s, slab); 292262306a36Sopenharmony_ci unsigned long memcg_data; 292362306a36Sopenharmony_ci void *vec; 292462306a36Sopenharmony_ci 292562306a36Sopenharmony_ci gfp &= ~OBJCGS_CLEAR_MASK; 292662306a36Sopenharmony_ci vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp, 292762306a36Sopenharmony_ci slab_nid(slab)); 292862306a36Sopenharmony_ci if (!vec) 292962306a36Sopenharmony_ci return -ENOMEM; 293062306a36Sopenharmony_ci 293162306a36Sopenharmony_ci memcg_data = (unsigned long) vec | MEMCG_DATA_OBJCGS; 293262306a36Sopenharmony_ci if (new_slab) { 293362306a36Sopenharmony_ci /* 293462306a36Sopenharmony_ci * If the slab is brand new and nobody can yet access its 293562306a36Sopenharmony_ci * memcg_data, no synchronization is required and memcg_data can 293662306a36Sopenharmony_ci * be simply assigned. 
293762306a36Sopenharmony_ci */ 293862306a36Sopenharmony_ci slab->memcg_data = memcg_data; 293962306a36Sopenharmony_ci } else if (cmpxchg(&slab->memcg_data, 0, memcg_data)) { 294062306a36Sopenharmony_ci /* 294162306a36Sopenharmony_ci * If the slab is already in use, somebody can allocate and 294262306a36Sopenharmony_ci * assign obj_cgroups in parallel. In this case the existing 294362306a36Sopenharmony_ci * objcg vector should be reused. 294462306a36Sopenharmony_ci */ 294562306a36Sopenharmony_ci kfree(vec); 294662306a36Sopenharmony_ci return 0; 294762306a36Sopenharmony_ci } 294862306a36Sopenharmony_ci 294962306a36Sopenharmony_ci kmemleak_not_leak(vec); 295062306a36Sopenharmony_ci return 0; 295162306a36Sopenharmony_ci} 295262306a36Sopenharmony_ci 295362306a36Sopenharmony_cistatic __always_inline 295462306a36Sopenharmony_cistruct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p) 295562306a36Sopenharmony_ci{ 295662306a36Sopenharmony_ci /* 295762306a36Sopenharmony_ci * Slab objects are accounted individually, not per-page. 295862306a36Sopenharmony_ci * Memcg membership data for each individual object is saved in 295962306a36Sopenharmony_ci * slab->memcg_data. 
296062306a36Sopenharmony_ci */ 296162306a36Sopenharmony_ci if (folio_test_slab(folio)) { 296262306a36Sopenharmony_ci struct obj_cgroup **objcgs; 296362306a36Sopenharmony_ci struct slab *slab; 296462306a36Sopenharmony_ci unsigned int off; 296562306a36Sopenharmony_ci 296662306a36Sopenharmony_ci slab = folio_slab(folio); 296762306a36Sopenharmony_ci objcgs = slab_objcgs(slab); 296862306a36Sopenharmony_ci if (!objcgs) 296962306a36Sopenharmony_ci return NULL; 297062306a36Sopenharmony_ci 297162306a36Sopenharmony_ci off = obj_to_index(slab->slab_cache, slab, p); 297262306a36Sopenharmony_ci if (objcgs[off]) 297362306a36Sopenharmony_ci return obj_cgroup_memcg(objcgs[off]); 297462306a36Sopenharmony_ci 297562306a36Sopenharmony_ci return NULL; 297662306a36Sopenharmony_ci } 297762306a36Sopenharmony_ci 297862306a36Sopenharmony_ci /* 297962306a36Sopenharmony_ci * folio_memcg_check() is used here, because in theory we can encounter 298062306a36Sopenharmony_ci * a folio where the slab flag has been cleared already, but 298162306a36Sopenharmony_ci * slab->memcg_data has not been freed yet 298262306a36Sopenharmony_ci * folio_memcg_check() will guarantee that a proper memory 298362306a36Sopenharmony_ci * cgroup pointer or NULL will be returned. 298462306a36Sopenharmony_ci */ 298562306a36Sopenharmony_ci return folio_memcg_check(folio); 298662306a36Sopenharmony_ci} 298762306a36Sopenharmony_ci 298862306a36Sopenharmony_ci/* 298962306a36Sopenharmony_ci * Returns a pointer to the memory cgroup to which the kernel object is charged. 299062306a36Sopenharmony_ci * 299162306a36Sopenharmony_ci * A passed kernel object can be a slab object, vmalloc object or a generic 299262306a36Sopenharmony_ci * kernel page, so different mechanisms for getting the memory cgroup pointer 299362306a36Sopenharmony_ci * should be used. 299462306a36Sopenharmony_ci * 299562306a36Sopenharmony_ci * In certain cases (e.g. 
kernel stacks or large kmallocs with SLUB) the caller 299662306a36Sopenharmony_ci * can not know for sure how the kernel object is implemented. 299762306a36Sopenharmony_ci * mem_cgroup_from_obj() can be safely used in such cases. 299862306a36Sopenharmony_ci * 299962306a36Sopenharmony_ci * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(), 300062306a36Sopenharmony_ci * cgroup_mutex, etc. 300162306a36Sopenharmony_ci */ 300262306a36Sopenharmony_cistruct mem_cgroup *mem_cgroup_from_obj(void *p) 300362306a36Sopenharmony_ci{ 300462306a36Sopenharmony_ci struct folio *folio; 300562306a36Sopenharmony_ci 300662306a36Sopenharmony_ci if (mem_cgroup_disabled()) 300762306a36Sopenharmony_ci return NULL; 300862306a36Sopenharmony_ci 300962306a36Sopenharmony_ci if (unlikely(is_vmalloc_addr(p))) 301062306a36Sopenharmony_ci folio = page_folio(vmalloc_to_page(p)); 301162306a36Sopenharmony_ci else 301262306a36Sopenharmony_ci folio = virt_to_folio(p); 301362306a36Sopenharmony_ci 301462306a36Sopenharmony_ci return mem_cgroup_from_obj_folio(folio, p); 301562306a36Sopenharmony_ci} 301662306a36Sopenharmony_ci 301762306a36Sopenharmony_ci/* 301862306a36Sopenharmony_ci * Returns a pointer to the memory cgroup to which the kernel object is charged. 301962306a36Sopenharmony_ci * Similar to mem_cgroup_from_obj(), but faster and not suitable for objects, 302062306a36Sopenharmony_ci * allocated using vmalloc(). 302162306a36Sopenharmony_ci * 302262306a36Sopenharmony_ci * A passed kernel object must be a slab object or a generic kernel page. 302362306a36Sopenharmony_ci * 302462306a36Sopenharmony_ci * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(), 302562306a36Sopenharmony_ci * cgroup_mutex, etc. 
302662306a36Sopenharmony_ci */ 302762306a36Sopenharmony_cistruct mem_cgroup *mem_cgroup_from_slab_obj(void *p) 302862306a36Sopenharmony_ci{ 302962306a36Sopenharmony_ci if (mem_cgroup_disabled()) 303062306a36Sopenharmony_ci return NULL; 303162306a36Sopenharmony_ci 303262306a36Sopenharmony_ci return mem_cgroup_from_obj_folio(virt_to_folio(p), p); 303362306a36Sopenharmony_ci} 303462306a36Sopenharmony_ci 303562306a36Sopenharmony_cistatic struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg) 303662306a36Sopenharmony_ci{ 303762306a36Sopenharmony_ci struct obj_cgroup *objcg = NULL; 303862306a36Sopenharmony_ci 303962306a36Sopenharmony_ci for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) { 304062306a36Sopenharmony_ci objcg = rcu_dereference(memcg->objcg); 304162306a36Sopenharmony_ci if (objcg && obj_cgroup_tryget(objcg)) 304262306a36Sopenharmony_ci break; 304362306a36Sopenharmony_ci objcg = NULL; 304462306a36Sopenharmony_ci } 304562306a36Sopenharmony_ci return objcg; 304662306a36Sopenharmony_ci} 304762306a36Sopenharmony_ci 304862306a36Sopenharmony_ci__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void) 304962306a36Sopenharmony_ci{ 305062306a36Sopenharmony_ci struct obj_cgroup *objcg = NULL; 305162306a36Sopenharmony_ci struct mem_cgroup *memcg; 305262306a36Sopenharmony_ci 305362306a36Sopenharmony_ci if (memcg_kmem_bypass()) 305462306a36Sopenharmony_ci return NULL; 305562306a36Sopenharmony_ci 305662306a36Sopenharmony_ci rcu_read_lock(); 305762306a36Sopenharmony_ci if (unlikely(active_memcg())) 305862306a36Sopenharmony_ci memcg = active_memcg(); 305962306a36Sopenharmony_ci else 306062306a36Sopenharmony_ci memcg = mem_cgroup_from_task(current); 306162306a36Sopenharmony_ci objcg = __get_obj_cgroup_from_memcg(memcg); 306262306a36Sopenharmony_ci rcu_read_unlock(); 306362306a36Sopenharmony_ci return objcg; 306462306a36Sopenharmony_ci} 306562306a36Sopenharmony_ci 306662306a36Sopenharmony_cistruct obj_cgroup 
*get_obj_cgroup_from_folio(struct folio *folio) 306762306a36Sopenharmony_ci{ 306862306a36Sopenharmony_ci struct obj_cgroup *objcg; 306962306a36Sopenharmony_ci 307062306a36Sopenharmony_ci if (!memcg_kmem_online()) 307162306a36Sopenharmony_ci return NULL; 307262306a36Sopenharmony_ci 307362306a36Sopenharmony_ci if (folio_memcg_kmem(folio)) { 307462306a36Sopenharmony_ci objcg = __folio_objcg(folio); 307562306a36Sopenharmony_ci obj_cgroup_get(objcg); 307662306a36Sopenharmony_ci } else { 307762306a36Sopenharmony_ci struct mem_cgroup *memcg; 307862306a36Sopenharmony_ci 307962306a36Sopenharmony_ci rcu_read_lock(); 308062306a36Sopenharmony_ci memcg = __folio_memcg(folio); 308162306a36Sopenharmony_ci if (memcg) 308262306a36Sopenharmony_ci objcg = __get_obj_cgroup_from_memcg(memcg); 308362306a36Sopenharmony_ci else 308462306a36Sopenharmony_ci objcg = NULL; 308562306a36Sopenharmony_ci rcu_read_unlock(); 308662306a36Sopenharmony_ci } 308762306a36Sopenharmony_ci return objcg; 308862306a36Sopenharmony_ci} 308962306a36Sopenharmony_ci 309062306a36Sopenharmony_cistatic void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages) 309162306a36Sopenharmony_ci{ 309262306a36Sopenharmony_ci mod_memcg_state(memcg, MEMCG_KMEM, nr_pages); 309362306a36Sopenharmony_ci if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { 309462306a36Sopenharmony_ci if (nr_pages > 0) 309562306a36Sopenharmony_ci page_counter_charge(&memcg->kmem, nr_pages); 309662306a36Sopenharmony_ci else 309762306a36Sopenharmony_ci page_counter_uncharge(&memcg->kmem, -nr_pages); 309862306a36Sopenharmony_ci } 309962306a36Sopenharmony_ci} 310062306a36Sopenharmony_ci 310162306a36Sopenharmony_ci 310262306a36Sopenharmony_ci/* 310362306a36Sopenharmony_ci * obj_cgroup_uncharge_pages: uncharge a number of kernel pages from a objcg 310462306a36Sopenharmony_ci * @objcg: object cgroup to uncharge 310562306a36Sopenharmony_ci * @nr_pages: number of pages to uncharge 310662306a36Sopenharmony_ci */ 310762306a36Sopenharmony_cistatic void 
obj_cgroup_uncharge_pages(struct obj_cgroup *objcg, 310862306a36Sopenharmony_ci unsigned int nr_pages) 310962306a36Sopenharmony_ci{ 311062306a36Sopenharmony_ci struct mem_cgroup *memcg; 311162306a36Sopenharmony_ci 311262306a36Sopenharmony_ci memcg = get_mem_cgroup_from_objcg(objcg); 311362306a36Sopenharmony_ci 311462306a36Sopenharmony_ci memcg_account_kmem(memcg, -nr_pages); 311562306a36Sopenharmony_ci refill_stock(memcg, nr_pages); 311662306a36Sopenharmony_ci 311762306a36Sopenharmony_ci css_put(&memcg->css); 311862306a36Sopenharmony_ci} 311962306a36Sopenharmony_ci 312062306a36Sopenharmony_ci/* 312162306a36Sopenharmony_ci * obj_cgroup_charge_pages: charge a number of kernel pages to a objcg 312262306a36Sopenharmony_ci * @objcg: object cgroup to charge 312362306a36Sopenharmony_ci * @gfp: reclaim mode 312462306a36Sopenharmony_ci * @nr_pages: number of pages to charge 312562306a36Sopenharmony_ci * 312662306a36Sopenharmony_ci * Returns 0 on success, an error code on failure. 312762306a36Sopenharmony_ci */ 312862306a36Sopenharmony_cistatic int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp, 312962306a36Sopenharmony_ci unsigned int nr_pages) 313062306a36Sopenharmony_ci{ 313162306a36Sopenharmony_ci struct mem_cgroup *memcg; 313262306a36Sopenharmony_ci int ret; 313362306a36Sopenharmony_ci 313462306a36Sopenharmony_ci memcg = get_mem_cgroup_from_objcg(objcg); 313562306a36Sopenharmony_ci 313662306a36Sopenharmony_ci ret = try_charge_memcg(memcg, gfp, nr_pages); 313762306a36Sopenharmony_ci if (ret) 313862306a36Sopenharmony_ci goto out; 313962306a36Sopenharmony_ci 314062306a36Sopenharmony_ci memcg_account_kmem(memcg, nr_pages); 314162306a36Sopenharmony_ciout: 314262306a36Sopenharmony_ci css_put(&memcg->css); 314362306a36Sopenharmony_ci 314462306a36Sopenharmony_ci return ret; 314562306a36Sopenharmony_ci} 314662306a36Sopenharmony_ci 314762306a36Sopenharmony_ci/** 314862306a36Sopenharmony_ci * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup 
314962306a36Sopenharmony_ci * @page: page to charge 315062306a36Sopenharmony_ci * @gfp: reclaim mode 315162306a36Sopenharmony_ci * @order: allocation order 315262306a36Sopenharmony_ci * 315362306a36Sopenharmony_ci * Returns 0 on success, an error code on failure. 315462306a36Sopenharmony_ci */ 315562306a36Sopenharmony_ciint __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) 315662306a36Sopenharmony_ci{ 315762306a36Sopenharmony_ci struct obj_cgroup *objcg; 315862306a36Sopenharmony_ci int ret = 0; 315962306a36Sopenharmony_ci 316062306a36Sopenharmony_ci objcg = get_obj_cgroup_from_current(); 316162306a36Sopenharmony_ci if (objcg) { 316262306a36Sopenharmony_ci ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order); 316362306a36Sopenharmony_ci if (!ret) { 316462306a36Sopenharmony_ci page->memcg_data = (unsigned long)objcg | 316562306a36Sopenharmony_ci MEMCG_DATA_KMEM; 316662306a36Sopenharmony_ci return 0; 316762306a36Sopenharmony_ci } 316862306a36Sopenharmony_ci obj_cgroup_put(objcg); 316962306a36Sopenharmony_ci } 317062306a36Sopenharmony_ci return ret; 317162306a36Sopenharmony_ci} 317262306a36Sopenharmony_ci 317362306a36Sopenharmony_ci/** 317462306a36Sopenharmony_ci * __memcg_kmem_uncharge_page: uncharge a kmem page 317562306a36Sopenharmony_ci * @page: page to uncharge 317662306a36Sopenharmony_ci * @order: allocation order 317762306a36Sopenharmony_ci */ 317862306a36Sopenharmony_civoid __memcg_kmem_uncharge_page(struct page *page, int order) 317962306a36Sopenharmony_ci{ 318062306a36Sopenharmony_ci struct folio *folio = page_folio(page); 318162306a36Sopenharmony_ci struct obj_cgroup *objcg; 318262306a36Sopenharmony_ci unsigned int nr_pages = 1 << order; 318362306a36Sopenharmony_ci 318462306a36Sopenharmony_ci if (!folio_memcg_kmem(folio)) 318562306a36Sopenharmony_ci return; 318662306a36Sopenharmony_ci 318762306a36Sopenharmony_ci objcg = __folio_objcg(folio); 318862306a36Sopenharmony_ci obj_cgroup_uncharge_pages(objcg, nr_pages); 318962306a36Sopenharmony_ci 
folio->memcg_data = 0; 319062306a36Sopenharmony_ci obj_cgroup_put(objcg); 319162306a36Sopenharmony_ci} 319262306a36Sopenharmony_ci 319362306a36Sopenharmony_civoid mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, 319462306a36Sopenharmony_ci enum node_stat_item idx, int nr) 319562306a36Sopenharmony_ci{ 319662306a36Sopenharmony_ci struct memcg_stock_pcp *stock; 319762306a36Sopenharmony_ci struct obj_cgroup *old = NULL; 319862306a36Sopenharmony_ci unsigned long flags; 319962306a36Sopenharmony_ci int *bytes; 320062306a36Sopenharmony_ci 320162306a36Sopenharmony_ci local_lock_irqsave(&memcg_stock.stock_lock, flags); 320262306a36Sopenharmony_ci stock = this_cpu_ptr(&memcg_stock); 320362306a36Sopenharmony_ci 320462306a36Sopenharmony_ci /* 320562306a36Sopenharmony_ci * Save vmstat data in stock and skip vmstat array update unless 320662306a36Sopenharmony_ci * accumulating over a page of vmstat data or when pgdat or idx 320762306a36Sopenharmony_ci * changes. 320862306a36Sopenharmony_ci */ 320962306a36Sopenharmony_ci if (READ_ONCE(stock->cached_objcg) != objcg) { 321062306a36Sopenharmony_ci old = drain_obj_stock(stock); 321162306a36Sopenharmony_ci obj_cgroup_get(objcg); 321262306a36Sopenharmony_ci stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) 321362306a36Sopenharmony_ci ? 
atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; 321462306a36Sopenharmony_ci WRITE_ONCE(stock->cached_objcg, objcg); 321562306a36Sopenharmony_ci stock->cached_pgdat = pgdat; 321662306a36Sopenharmony_ci } else if (stock->cached_pgdat != pgdat) { 321762306a36Sopenharmony_ci /* Flush the existing cached vmstat data */ 321862306a36Sopenharmony_ci struct pglist_data *oldpg = stock->cached_pgdat; 321962306a36Sopenharmony_ci 322062306a36Sopenharmony_ci if (stock->nr_slab_reclaimable_b) { 322162306a36Sopenharmony_ci mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B, 322262306a36Sopenharmony_ci stock->nr_slab_reclaimable_b); 322362306a36Sopenharmony_ci stock->nr_slab_reclaimable_b = 0; 322462306a36Sopenharmony_ci } 322562306a36Sopenharmony_ci if (stock->nr_slab_unreclaimable_b) { 322662306a36Sopenharmony_ci mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B, 322762306a36Sopenharmony_ci stock->nr_slab_unreclaimable_b); 322862306a36Sopenharmony_ci stock->nr_slab_unreclaimable_b = 0; 322962306a36Sopenharmony_ci } 323062306a36Sopenharmony_ci stock->cached_pgdat = pgdat; 323162306a36Sopenharmony_ci } 323262306a36Sopenharmony_ci 323362306a36Sopenharmony_ci bytes = (idx == NR_SLAB_RECLAIMABLE_B) ? &stock->nr_slab_reclaimable_b 323462306a36Sopenharmony_ci : &stock->nr_slab_unreclaimable_b; 323562306a36Sopenharmony_ci /* 323662306a36Sopenharmony_ci * Even for large object >= PAGE_SIZE, the vmstat data will still be 323762306a36Sopenharmony_ci * cached locally at least once before pushing it out. 
323862306a36Sopenharmony_ci */ 323962306a36Sopenharmony_ci if (!*bytes) { 324062306a36Sopenharmony_ci *bytes = nr; 324162306a36Sopenharmony_ci nr = 0; 324262306a36Sopenharmony_ci } else { 324362306a36Sopenharmony_ci *bytes += nr; 324462306a36Sopenharmony_ci if (abs(*bytes) > PAGE_SIZE) { 324562306a36Sopenharmony_ci nr = *bytes; 324662306a36Sopenharmony_ci *bytes = 0; 324762306a36Sopenharmony_ci } else { 324862306a36Sopenharmony_ci nr = 0; 324962306a36Sopenharmony_ci } 325062306a36Sopenharmony_ci } 325162306a36Sopenharmony_ci if (nr) 325262306a36Sopenharmony_ci mod_objcg_mlstate(objcg, pgdat, idx, nr); 325362306a36Sopenharmony_ci 325462306a36Sopenharmony_ci local_unlock_irqrestore(&memcg_stock.stock_lock, flags); 325562306a36Sopenharmony_ci if (old) 325662306a36Sopenharmony_ci obj_cgroup_put(old); 325762306a36Sopenharmony_ci} 325862306a36Sopenharmony_ci 325962306a36Sopenharmony_cistatic bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) 326062306a36Sopenharmony_ci{ 326162306a36Sopenharmony_ci struct memcg_stock_pcp *stock; 326262306a36Sopenharmony_ci unsigned long flags; 326362306a36Sopenharmony_ci bool ret = false; 326462306a36Sopenharmony_ci 326562306a36Sopenharmony_ci local_lock_irqsave(&memcg_stock.stock_lock, flags); 326662306a36Sopenharmony_ci 326762306a36Sopenharmony_ci stock = this_cpu_ptr(&memcg_stock); 326862306a36Sopenharmony_ci if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) { 326962306a36Sopenharmony_ci stock->nr_bytes -= nr_bytes; 327062306a36Sopenharmony_ci ret = true; 327162306a36Sopenharmony_ci } 327262306a36Sopenharmony_ci 327362306a36Sopenharmony_ci local_unlock_irqrestore(&memcg_stock.stock_lock, flags); 327462306a36Sopenharmony_ci 327562306a36Sopenharmony_ci return ret; 327662306a36Sopenharmony_ci} 327762306a36Sopenharmony_ci 327862306a36Sopenharmony_cistatic struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) 327962306a36Sopenharmony_ci{ 328062306a36Sopenharmony_ci struct obj_cgroup 
*old = READ_ONCE(stock->cached_objcg); 328162306a36Sopenharmony_ci 328262306a36Sopenharmony_ci if (!old) 328362306a36Sopenharmony_ci return NULL; 328462306a36Sopenharmony_ci 328562306a36Sopenharmony_ci if (stock->nr_bytes) { 328662306a36Sopenharmony_ci unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT; 328762306a36Sopenharmony_ci unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1); 328862306a36Sopenharmony_ci 328962306a36Sopenharmony_ci if (nr_pages) { 329062306a36Sopenharmony_ci struct mem_cgroup *memcg; 329162306a36Sopenharmony_ci 329262306a36Sopenharmony_ci memcg = get_mem_cgroup_from_objcg(old); 329362306a36Sopenharmony_ci 329462306a36Sopenharmony_ci memcg_account_kmem(memcg, -nr_pages); 329562306a36Sopenharmony_ci __refill_stock(memcg, nr_pages); 329662306a36Sopenharmony_ci 329762306a36Sopenharmony_ci css_put(&memcg->css); 329862306a36Sopenharmony_ci } 329962306a36Sopenharmony_ci 330062306a36Sopenharmony_ci /* 330162306a36Sopenharmony_ci * The leftover is flushed to the centralized per-memcg value. 330262306a36Sopenharmony_ci * On the next attempt to refill obj stock it will be moved 330362306a36Sopenharmony_ci * to a per-cpu stock (probably, on an other CPU), see 330462306a36Sopenharmony_ci * refill_obj_stock(). 330562306a36Sopenharmony_ci * 330662306a36Sopenharmony_ci * How often it's flushed is a trade-off between the memory 330762306a36Sopenharmony_ci * limit enforcement accuracy and potential CPU contention, 330862306a36Sopenharmony_ci * so it might be changed in the future. 
330962306a36Sopenharmony_ci */ 331062306a36Sopenharmony_ci atomic_add(nr_bytes, &old->nr_charged_bytes); 331162306a36Sopenharmony_ci stock->nr_bytes = 0; 331262306a36Sopenharmony_ci } 331362306a36Sopenharmony_ci 331462306a36Sopenharmony_ci /* 331562306a36Sopenharmony_ci * Flush the vmstat data in current stock 331662306a36Sopenharmony_ci */ 331762306a36Sopenharmony_ci if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) { 331862306a36Sopenharmony_ci if (stock->nr_slab_reclaimable_b) { 331962306a36Sopenharmony_ci mod_objcg_mlstate(old, stock->cached_pgdat, 332062306a36Sopenharmony_ci NR_SLAB_RECLAIMABLE_B, 332162306a36Sopenharmony_ci stock->nr_slab_reclaimable_b); 332262306a36Sopenharmony_ci stock->nr_slab_reclaimable_b = 0; 332362306a36Sopenharmony_ci } 332462306a36Sopenharmony_ci if (stock->nr_slab_unreclaimable_b) { 332562306a36Sopenharmony_ci mod_objcg_mlstate(old, stock->cached_pgdat, 332662306a36Sopenharmony_ci NR_SLAB_UNRECLAIMABLE_B, 332762306a36Sopenharmony_ci stock->nr_slab_unreclaimable_b); 332862306a36Sopenharmony_ci stock->nr_slab_unreclaimable_b = 0; 332962306a36Sopenharmony_ci } 333062306a36Sopenharmony_ci stock->cached_pgdat = NULL; 333162306a36Sopenharmony_ci } 333262306a36Sopenharmony_ci 333362306a36Sopenharmony_ci WRITE_ONCE(stock->cached_objcg, NULL); 333462306a36Sopenharmony_ci /* 333562306a36Sopenharmony_ci * The `old' objects needs to be released by the caller via 333662306a36Sopenharmony_ci * obj_cgroup_put() outside of memcg_stock_pcp::stock_lock. 
333762306a36Sopenharmony_ci */ 333862306a36Sopenharmony_ci return old; 333962306a36Sopenharmony_ci} 334062306a36Sopenharmony_ci 334162306a36Sopenharmony_cistatic bool obj_stock_flush_required(struct memcg_stock_pcp *stock, 334262306a36Sopenharmony_ci struct mem_cgroup *root_memcg) 334362306a36Sopenharmony_ci{ 334462306a36Sopenharmony_ci struct obj_cgroup *objcg = READ_ONCE(stock->cached_objcg); 334562306a36Sopenharmony_ci struct mem_cgroup *memcg; 334662306a36Sopenharmony_ci 334762306a36Sopenharmony_ci if (objcg) { 334862306a36Sopenharmony_ci memcg = obj_cgroup_memcg(objcg); 334962306a36Sopenharmony_ci if (memcg && mem_cgroup_is_descendant(memcg, root_memcg)) 335062306a36Sopenharmony_ci return true; 335162306a36Sopenharmony_ci } 335262306a36Sopenharmony_ci 335362306a36Sopenharmony_ci return false; 335462306a36Sopenharmony_ci} 335562306a36Sopenharmony_ci 335662306a36Sopenharmony_cistatic void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, 335762306a36Sopenharmony_ci bool allow_uncharge) 335862306a36Sopenharmony_ci{ 335962306a36Sopenharmony_ci struct memcg_stock_pcp *stock; 336062306a36Sopenharmony_ci struct obj_cgroup *old = NULL; 336162306a36Sopenharmony_ci unsigned long flags; 336262306a36Sopenharmony_ci unsigned int nr_pages = 0; 336362306a36Sopenharmony_ci 336462306a36Sopenharmony_ci local_lock_irqsave(&memcg_stock.stock_lock, flags); 336562306a36Sopenharmony_ci 336662306a36Sopenharmony_ci stock = this_cpu_ptr(&memcg_stock); 336762306a36Sopenharmony_ci if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */ 336862306a36Sopenharmony_ci old = drain_obj_stock(stock); 336962306a36Sopenharmony_ci obj_cgroup_get(objcg); 337062306a36Sopenharmony_ci WRITE_ONCE(stock->cached_objcg, objcg); 337162306a36Sopenharmony_ci stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) 337262306a36Sopenharmony_ci ? 
atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; 337362306a36Sopenharmony_ci allow_uncharge = true; /* Allow uncharge when objcg changes */ 337462306a36Sopenharmony_ci } 337562306a36Sopenharmony_ci stock->nr_bytes += nr_bytes; 337662306a36Sopenharmony_ci 337762306a36Sopenharmony_ci if (allow_uncharge && (stock->nr_bytes > PAGE_SIZE)) { 337862306a36Sopenharmony_ci nr_pages = stock->nr_bytes >> PAGE_SHIFT; 337962306a36Sopenharmony_ci stock->nr_bytes &= (PAGE_SIZE - 1); 338062306a36Sopenharmony_ci } 338162306a36Sopenharmony_ci 338262306a36Sopenharmony_ci local_unlock_irqrestore(&memcg_stock.stock_lock, flags); 338362306a36Sopenharmony_ci if (old) 338462306a36Sopenharmony_ci obj_cgroup_put(old); 338562306a36Sopenharmony_ci 338662306a36Sopenharmony_ci if (nr_pages) 338762306a36Sopenharmony_ci obj_cgroup_uncharge_pages(objcg, nr_pages); 338862306a36Sopenharmony_ci} 338962306a36Sopenharmony_ci 339062306a36Sopenharmony_ciint obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) 339162306a36Sopenharmony_ci{ 339262306a36Sopenharmony_ci unsigned int nr_pages, nr_bytes; 339362306a36Sopenharmony_ci int ret; 339462306a36Sopenharmony_ci 339562306a36Sopenharmony_ci if (consume_obj_stock(objcg, size)) 339662306a36Sopenharmony_ci return 0; 339762306a36Sopenharmony_ci 339862306a36Sopenharmony_ci /* 339962306a36Sopenharmony_ci * In theory, objcg->nr_charged_bytes can have enough 340062306a36Sopenharmony_ci * pre-charged bytes to satisfy the allocation. However, 340162306a36Sopenharmony_ci * flushing objcg->nr_charged_bytes requires two atomic 340262306a36Sopenharmony_ci * operations, and objcg->nr_charged_bytes can't be big. 340362306a36Sopenharmony_ci * The shared objcg->nr_charged_bytes can also become a 340462306a36Sopenharmony_ci * performance bottleneck if all tasks of the same memcg are 340562306a36Sopenharmony_ci * trying to update it. So it's better to ignore it and try 340662306a36Sopenharmony_ci * grab some new pages. 
The stock's nr_bytes will be flushed to 340762306a36Sopenharmony_ci * objcg->nr_charged_bytes later on when objcg changes. 340862306a36Sopenharmony_ci * 340962306a36Sopenharmony_ci * The stock's nr_bytes may contain enough pre-charged bytes 341062306a36Sopenharmony_ci * to allow one less page from being charged, but we can't rely 341162306a36Sopenharmony_ci * on the pre-charged bytes not being changed outside of 341262306a36Sopenharmony_ci * consume_obj_stock() or refill_obj_stock(). So ignore those 341362306a36Sopenharmony_ci * pre-charged bytes as well when charging pages. To avoid a 341462306a36Sopenharmony_ci * page uncharge right after a page charge, we set the 341562306a36Sopenharmony_ci * allow_uncharge flag to false when calling refill_obj_stock() 341662306a36Sopenharmony_ci * to temporarily allow the pre-charged bytes to exceed the page 341762306a36Sopenharmony_ci * size limit. The maximum reachable value of the pre-charged 341862306a36Sopenharmony_ci * bytes is (sizeof(object) + PAGE_SIZE - 2) if there is no data 341962306a36Sopenharmony_ci * race. 
342062306a36Sopenharmony_ci */ 342162306a36Sopenharmony_ci nr_pages = size >> PAGE_SHIFT; 342262306a36Sopenharmony_ci nr_bytes = size & (PAGE_SIZE - 1); 342362306a36Sopenharmony_ci 342462306a36Sopenharmony_ci if (nr_bytes) 342562306a36Sopenharmony_ci nr_pages += 1; 342662306a36Sopenharmony_ci 342762306a36Sopenharmony_ci ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages); 342862306a36Sopenharmony_ci if (!ret && nr_bytes) 342962306a36Sopenharmony_ci refill_obj_stock(objcg, PAGE_SIZE - nr_bytes, false); 343062306a36Sopenharmony_ci 343162306a36Sopenharmony_ci return ret; 343262306a36Sopenharmony_ci} 343362306a36Sopenharmony_ci 343462306a36Sopenharmony_civoid obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size) 343562306a36Sopenharmony_ci{ 343662306a36Sopenharmony_ci refill_obj_stock(objcg, size, true); 343762306a36Sopenharmony_ci} 343862306a36Sopenharmony_ci 343962306a36Sopenharmony_ci#endif /* CONFIG_MEMCG_KMEM */ 344062306a36Sopenharmony_ci 344162306a36Sopenharmony_ci/* 344262306a36Sopenharmony_ci * Because page_memcg(head) is not set on tails, set it now. 
344362306a36Sopenharmony_ci */ 344462306a36Sopenharmony_civoid split_page_memcg(struct page *head, unsigned int nr) 344562306a36Sopenharmony_ci{ 344662306a36Sopenharmony_ci struct folio *folio = page_folio(head); 344762306a36Sopenharmony_ci struct mem_cgroup *memcg = folio_memcg(folio); 344862306a36Sopenharmony_ci int i; 344962306a36Sopenharmony_ci 345062306a36Sopenharmony_ci if (mem_cgroup_disabled() || !memcg) 345162306a36Sopenharmony_ci return; 345262306a36Sopenharmony_ci 345362306a36Sopenharmony_ci for (i = 1; i < nr; i++) 345462306a36Sopenharmony_ci folio_page(folio, i)->memcg_data = folio->memcg_data; 345562306a36Sopenharmony_ci 345662306a36Sopenharmony_ci if (folio_memcg_kmem(folio)) 345762306a36Sopenharmony_ci obj_cgroup_get_many(__folio_objcg(folio), nr - 1); 345862306a36Sopenharmony_ci else 345962306a36Sopenharmony_ci css_get_many(&memcg->css, nr - 1); 346062306a36Sopenharmony_ci} 346162306a36Sopenharmony_ci 346262306a36Sopenharmony_ci#ifdef CONFIG_SWAP 346362306a36Sopenharmony_ci/** 346462306a36Sopenharmony_ci * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. 346562306a36Sopenharmony_ci * @entry: swap entry to be moved 346662306a36Sopenharmony_ci * @from: mem_cgroup which the entry is moved from 346762306a36Sopenharmony_ci * @to: mem_cgroup which the entry is moved to 346862306a36Sopenharmony_ci * 346962306a36Sopenharmony_ci * It succeeds only when the swap_cgroup's record for this entry is the same 347062306a36Sopenharmony_ci * as the mem_cgroup's id of @from. 347162306a36Sopenharmony_ci * 347262306a36Sopenharmony_ci * Returns 0 on success, -EINVAL on failure. 347362306a36Sopenharmony_ci * 347462306a36Sopenharmony_ci * The caller must have charged to @to, IOW, called page_counter_charge() about 347562306a36Sopenharmony_ci * both res and memsw, and called css_get(). 
347662306a36Sopenharmony_ci */ 347762306a36Sopenharmony_cistatic int mem_cgroup_move_swap_account(swp_entry_t entry, 347862306a36Sopenharmony_ci struct mem_cgroup *from, struct mem_cgroup *to) 347962306a36Sopenharmony_ci{ 348062306a36Sopenharmony_ci unsigned short old_id, new_id; 348162306a36Sopenharmony_ci 348262306a36Sopenharmony_ci old_id = mem_cgroup_id(from); 348362306a36Sopenharmony_ci new_id = mem_cgroup_id(to); 348462306a36Sopenharmony_ci 348562306a36Sopenharmony_ci if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 348662306a36Sopenharmony_ci mod_memcg_state(from, MEMCG_SWAP, -1); 348762306a36Sopenharmony_ci mod_memcg_state(to, MEMCG_SWAP, 1); 348862306a36Sopenharmony_ci return 0; 348962306a36Sopenharmony_ci } 349062306a36Sopenharmony_ci return -EINVAL; 349162306a36Sopenharmony_ci} 349262306a36Sopenharmony_ci#else 349362306a36Sopenharmony_cistatic inline int mem_cgroup_move_swap_account(swp_entry_t entry, 349462306a36Sopenharmony_ci struct mem_cgroup *from, struct mem_cgroup *to) 349562306a36Sopenharmony_ci{ 349662306a36Sopenharmony_ci return -EINVAL; 349762306a36Sopenharmony_ci} 349862306a36Sopenharmony_ci#endif 349962306a36Sopenharmony_ci 350062306a36Sopenharmony_cistatic DEFINE_MUTEX(memcg_max_mutex); 350162306a36Sopenharmony_ci 350262306a36Sopenharmony_cistatic int mem_cgroup_resize_max(struct mem_cgroup *memcg, 350362306a36Sopenharmony_ci unsigned long max, bool memsw) 350462306a36Sopenharmony_ci{ 350562306a36Sopenharmony_ci bool enlarge = false; 350662306a36Sopenharmony_ci bool drained = false; 350762306a36Sopenharmony_ci int ret; 350862306a36Sopenharmony_ci bool limits_invariant; 350962306a36Sopenharmony_ci struct page_counter *counter = memsw ? 
&memcg->memsw : &memcg->memory; 351062306a36Sopenharmony_ci 351162306a36Sopenharmony_ci do { 351262306a36Sopenharmony_ci if (signal_pending(current)) { 351362306a36Sopenharmony_ci ret = -EINTR; 351462306a36Sopenharmony_ci break; 351562306a36Sopenharmony_ci } 351662306a36Sopenharmony_ci 351762306a36Sopenharmony_ci mutex_lock(&memcg_max_mutex); 351862306a36Sopenharmony_ci /* 351962306a36Sopenharmony_ci * Make sure that the new limit (memsw or memory limit) doesn't 352062306a36Sopenharmony_ci * break our basic invariant rule memory.max <= memsw.max. 352162306a36Sopenharmony_ci */ 352262306a36Sopenharmony_ci limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) : 352362306a36Sopenharmony_ci max <= memcg->memsw.max; 352462306a36Sopenharmony_ci if (!limits_invariant) { 352562306a36Sopenharmony_ci mutex_unlock(&memcg_max_mutex); 352662306a36Sopenharmony_ci ret = -EINVAL; 352762306a36Sopenharmony_ci break; 352862306a36Sopenharmony_ci } 352962306a36Sopenharmony_ci if (max > counter->max) 353062306a36Sopenharmony_ci enlarge = true; 353162306a36Sopenharmony_ci ret = page_counter_set_max(counter, max); 353262306a36Sopenharmony_ci mutex_unlock(&memcg_max_mutex); 353362306a36Sopenharmony_ci 353462306a36Sopenharmony_ci if (!ret) 353562306a36Sopenharmony_ci break; 353662306a36Sopenharmony_ci 353762306a36Sopenharmony_ci if (!drained) { 353862306a36Sopenharmony_ci drain_all_stock(memcg); 353962306a36Sopenharmony_ci drained = true; 354062306a36Sopenharmony_ci continue; 354162306a36Sopenharmony_ci } 354262306a36Sopenharmony_ci 354362306a36Sopenharmony_ci if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, 354462306a36Sopenharmony_ci memsw ? 
0 : MEMCG_RECLAIM_MAY_SWAP)) { 354562306a36Sopenharmony_ci ret = -EBUSY; 354662306a36Sopenharmony_ci break; 354762306a36Sopenharmony_ci } 354862306a36Sopenharmony_ci } while (true); 354962306a36Sopenharmony_ci 355062306a36Sopenharmony_ci if (!ret && enlarge) 355162306a36Sopenharmony_ci memcg_oom_recover(memcg); 355262306a36Sopenharmony_ci 355362306a36Sopenharmony_ci return ret; 355462306a36Sopenharmony_ci} 355562306a36Sopenharmony_ci 355662306a36Sopenharmony_ciunsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, 355762306a36Sopenharmony_ci gfp_t gfp_mask, 355862306a36Sopenharmony_ci unsigned long *total_scanned) 355962306a36Sopenharmony_ci{ 356062306a36Sopenharmony_ci unsigned long nr_reclaimed = 0; 356162306a36Sopenharmony_ci struct mem_cgroup_per_node *mz, *next_mz = NULL; 356262306a36Sopenharmony_ci unsigned long reclaimed; 356362306a36Sopenharmony_ci int loop = 0; 356462306a36Sopenharmony_ci struct mem_cgroup_tree_per_node *mctz; 356562306a36Sopenharmony_ci unsigned long excess; 356662306a36Sopenharmony_ci 356762306a36Sopenharmony_ci if (lru_gen_enabled()) 356862306a36Sopenharmony_ci return 0; 356962306a36Sopenharmony_ci 357062306a36Sopenharmony_ci if (order > 0) 357162306a36Sopenharmony_ci return 0; 357262306a36Sopenharmony_ci 357362306a36Sopenharmony_ci mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id]; 357462306a36Sopenharmony_ci 357562306a36Sopenharmony_ci /* 357662306a36Sopenharmony_ci * Do not even bother to check the largest node if the root 357762306a36Sopenharmony_ci * is empty. Do it lockless to prevent lock bouncing. Races 357862306a36Sopenharmony_ci * are acceptable as soft limit is best effort anyway. 
357962306a36Sopenharmony_ci */ 358062306a36Sopenharmony_ci if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root)) 358162306a36Sopenharmony_ci return 0; 358262306a36Sopenharmony_ci 358362306a36Sopenharmony_ci /* 358462306a36Sopenharmony_ci * This loop can run a while, specially if mem_cgroup's continuously 358562306a36Sopenharmony_ci * keep exceeding their soft limit and putting the system under 358662306a36Sopenharmony_ci * pressure 358762306a36Sopenharmony_ci */ 358862306a36Sopenharmony_ci do { 358962306a36Sopenharmony_ci if (next_mz) 359062306a36Sopenharmony_ci mz = next_mz; 359162306a36Sopenharmony_ci else 359262306a36Sopenharmony_ci mz = mem_cgroup_largest_soft_limit_node(mctz); 359362306a36Sopenharmony_ci if (!mz) 359462306a36Sopenharmony_ci break; 359562306a36Sopenharmony_ci 359662306a36Sopenharmony_ci reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat, 359762306a36Sopenharmony_ci gfp_mask, total_scanned); 359862306a36Sopenharmony_ci nr_reclaimed += reclaimed; 359962306a36Sopenharmony_ci spin_lock_irq(&mctz->lock); 360062306a36Sopenharmony_ci 360162306a36Sopenharmony_ci /* 360262306a36Sopenharmony_ci * If we failed to reclaim anything from this memory cgroup 360362306a36Sopenharmony_ci * it is time to move on to the next cgroup 360462306a36Sopenharmony_ci */ 360562306a36Sopenharmony_ci next_mz = NULL; 360662306a36Sopenharmony_ci if (!reclaimed) 360762306a36Sopenharmony_ci next_mz = __mem_cgroup_largest_soft_limit_node(mctz); 360862306a36Sopenharmony_ci 360962306a36Sopenharmony_ci excess = soft_limit_excess(mz->memcg); 361062306a36Sopenharmony_ci /* 361162306a36Sopenharmony_ci * One school of thought says that we should not add 361262306a36Sopenharmony_ci * back the node to the tree if reclaim returns 0. 361362306a36Sopenharmony_ci * But our reclaim could return 0, simply because due 361462306a36Sopenharmony_ci * to priority we are exposing a smaller subset of 361562306a36Sopenharmony_ci * memory to reclaim from. 
Consider this as a longer 361662306a36Sopenharmony_ci * term TODO. 361762306a36Sopenharmony_ci */ 361862306a36Sopenharmony_ci /* If excess == 0, no tree ops */ 361962306a36Sopenharmony_ci __mem_cgroup_insert_exceeded(mz, mctz, excess); 362062306a36Sopenharmony_ci spin_unlock_irq(&mctz->lock); 362162306a36Sopenharmony_ci css_put(&mz->memcg->css); 362262306a36Sopenharmony_ci loop++; 362362306a36Sopenharmony_ci /* 362462306a36Sopenharmony_ci * Could not reclaim anything and there are no more 362562306a36Sopenharmony_ci * mem cgroups to try or we seem to be looping without 362662306a36Sopenharmony_ci * reclaiming anything. 362762306a36Sopenharmony_ci */ 362862306a36Sopenharmony_ci if (!nr_reclaimed && 362962306a36Sopenharmony_ci (next_mz == NULL || 363062306a36Sopenharmony_ci loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) 363162306a36Sopenharmony_ci break; 363262306a36Sopenharmony_ci } while (!nr_reclaimed); 363362306a36Sopenharmony_ci if (next_mz) 363462306a36Sopenharmony_ci css_put(&next_mz->memcg->css); 363562306a36Sopenharmony_ci return nr_reclaimed; 363662306a36Sopenharmony_ci} 363762306a36Sopenharmony_ci 363862306a36Sopenharmony_ci/* 363962306a36Sopenharmony_ci * Reclaims as many pages from the given memcg as possible. 364062306a36Sopenharmony_ci * 364162306a36Sopenharmony_ci * Caller is responsible for holding css reference for memcg. 
364262306a36Sopenharmony_ci */ 364362306a36Sopenharmony_cistatic int mem_cgroup_force_empty(struct mem_cgroup *memcg) 364462306a36Sopenharmony_ci{ 364562306a36Sopenharmony_ci int nr_retries = MAX_RECLAIM_RETRIES; 364662306a36Sopenharmony_ci 364762306a36Sopenharmony_ci /* we call try-to-free pages for make this cgroup empty */ 364862306a36Sopenharmony_ci lru_add_drain_all(); 364962306a36Sopenharmony_ci 365062306a36Sopenharmony_ci drain_all_stock(memcg); 365162306a36Sopenharmony_ci 365262306a36Sopenharmony_ci /* try to free all pages in this cgroup */ 365362306a36Sopenharmony_ci while (nr_retries && page_counter_read(&memcg->memory)) { 365462306a36Sopenharmony_ci if (signal_pending(current)) 365562306a36Sopenharmony_ci return -EINTR; 365662306a36Sopenharmony_ci 365762306a36Sopenharmony_ci if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, 365862306a36Sopenharmony_ci MEMCG_RECLAIM_MAY_SWAP)) 365962306a36Sopenharmony_ci nr_retries--; 366062306a36Sopenharmony_ci } 366162306a36Sopenharmony_ci 366262306a36Sopenharmony_ci return 0; 366362306a36Sopenharmony_ci} 366462306a36Sopenharmony_ci 366562306a36Sopenharmony_cistatic ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of, 366662306a36Sopenharmony_ci char *buf, size_t nbytes, 366762306a36Sopenharmony_ci loff_t off) 366862306a36Sopenharmony_ci{ 366962306a36Sopenharmony_ci struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 367062306a36Sopenharmony_ci 367162306a36Sopenharmony_ci if (mem_cgroup_is_root(memcg)) 367262306a36Sopenharmony_ci return -EINVAL; 367362306a36Sopenharmony_ci return mem_cgroup_force_empty(memcg) ?: nbytes; 367462306a36Sopenharmony_ci} 367562306a36Sopenharmony_ci 367662306a36Sopenharmony_cistatic u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, 367762306a36Sopenharmony_ci struct cftype *cft) 367862306a36Sopenharmony_ci{ 367962306a36Sopenharmony_ci return 1; 368062306a36Sopenharmony_ci} 368162306a36Sopenharmony_ci 368262306a36Sopenharmony_cistatic int 
mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
			   struct cftype *cft, u64 val)
{
	/* Hierarchy is always enabled; only writing "1" is accepted. */
	if (val == 1)
		return 0;

	pr_warn_once("Non-hierarchical mode is deprecated. "
		     "Please report your usecase to linux-mm@kvack.org if you "
		     "depend on this functionality.\n");

	return -EINVAL;
}

/*
 * Return @memcg's current usage in pages; with @swap, include swap usage.
 * The root memcg is approximated from global counters (see below).
 */
static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
{
	unsigned long val;

	if (mem_cgroup_is_root(memcg)) {
		/*
		 * Approximate root's usage from global state. This isn't
		 * perfect, but the root usage was always an approximation.
		 */
		val = global_node_page_state(NR_FILE_PAGES) +
			global_node_page_state(NR_ANON_MAPPED);
		if (swap)
			val += total_swap_pages - get_nr_swap_pages();
	} else {
		if (!swap)
			val = page_counter_read(&memcg->memory);
		else
			val = page_counter_read(&memcg->memsw);
	}
	return val;
}

/*
 * MEMFILE_ATTR() values: which attribute of a page counter a cgroup1
 * control file exposes (see mem_cgroup_read_u64()/mem_cgroup_write()).
 */
enum {
	RES_USAGE,
	RES_LIMIT,
	RES_MAX_USAGE,
	RES_FAILCNT,
	RES_SOFT_LIMIT,
};

/*
 * Read handler for the cgroup1 counter files. cft->private encodes both
 * the counter (_MEM, _MEMSWAP, _KMEM, _TCP) and the attribute (RES_*).
 * Values are returned in bytes, except RES_FAILCNT which is a raw count.
 */
static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
			       struct cftype *cft)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
	struct page_counter *counter;

	switch (MEMFILE_TYPE(cft->private)) {
	case _MEM:
		counter = &memcg->memory;
		break;
	case _MEMSWAP:
		counter = &memcg->memsw;
		break;
	case _KMEM:
		counter = &memcg->kmem;
		break;
	case _TCP:
		counter = &memcg->tcpmem;
		break;
	default:
		BUG();
	}

	switch (MEMFILE_ATTR(cft->private)) {
	case RES_USAGE:
		/* memory/memsw usage goes through mem_cgroup_usage() so the
		 * root cgroup gets the global-state approximation. */
		if (counter == &memcg->memory)
			return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
		if (counter == &memcg->memsw)
			return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
		return (u64)page_counter_read(counter) * PAGE_SIZE;
	case RES_LIMIT:
		return (u64)counter->max * PAGE_SIZE;
	case RES_MAX_USAGE:
		return (u64)counter->watermark * PAGE_SIZE;
	case RES_FAILCNT:
		return counter->failcnt;
	case RES_SOFT_LIMIT:
		return (u64)READ_ONCE(memcg->soft_limit) * PAGE_SIZE;
	default:
		BUG();
	}
}

/*
 * This function doesn't do anything useful. Its only job is to provide a read
 * handler for a file so that cgroup_file_mode() will add read permissions.
 */
static int mem_cgroup_dummy_seq_show(__always_unused struct seq_file *m,
				     __always_unused void *v)
{
	return -EINVAL;
}

#ifdef CONFIG_MEMCG_KMEM
/*
 * Set up kernel-memory accounting for a newly onlined @memcg: allocate its
 * obj_cgroup, publish it via RCU, and enable the memcg_kmem_online static
 * branch. No-op for the root memcg or when kmem accounting is disabled.
 * Returns 0 on success or -ENOMEM.
 */
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
	struct obj_cgroup *objcg;

	if (mem_cgroup_kmem_disabled())
		return 0;

	if (unlikely(mem_cgroup_is_root(memcg)))
		return 0;

	objcg = obj_cgroup_alloc();
	if (!objcg)
		return -ENOMEM;

	objcg->memcg = memcg;
	/* Publish only after objcg->memcg is set, so RCU readers see it. */
	rcu_assign_pointer(memcg->objcg, objcg);

	static_branch_enable(&memcg_kmem_online_key);

	memcg->kmemcg_id = memcg->id.id;

	return 0;
}

/*
 * Tear down kmem state when @memcg goes offline: reparent its obj_cgroups
 * and list_lrus to the nearest surviving ancestor (root if none).
 */
static void memcg_offline_kmem(struct mem_cgroup *memcg)
{
	struct mem_cgroup *parent;

	if (mem_cgroup_kmem_disabled())
		return;

	if (unlikely(mem_cgroup_is_root(memcg)))
		return;

	parent = parent_mem_cgroup(memcg);
	if (!parent)
		parent = root_mem_cgroup;

	memcg_reparent_objcgs(memcg, parent);

	/*
	 * After we have finished memcg_reparent_objcgs(), all list_lrus
	 * corresponding to this cgroup are guaranteed to remain empty.
	 * The ordering is imposed by list_lru_node->lock taken by
	 * memcg_reparent_list_lrus().
	 */
	memcg_reparent_list_lrus(memcg, parent);
}
#else
/* !CONFIG_MEMCG_KMEM: kernel-memory accounting compiled out. */
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
	return 0;
}
static void memcg_offline_kmem(struct mem_cgroup *memcg)
{
}
#endif /* CONFIG_MEMCG_KMEM */

/*
 * Set the tcpmem limit to @max pages. The first successful write also
 * activates TCP accounting for this memcg (static branch + tcpmem_active,
 * in that order -- see the comment below).
 */
static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
{
	int ret;

	mutex_lock(&memcg_max_mutex);

	ret = page_counter_set_max(&memcg->tcpmem, max);
	if (ret)
		goto out;

	if (!memcg->tcpmem_active) {
		/*
		 * The active flag needs to be written after the static_key
		 * update. This is what guarantees that the socket activation
		 * function is the last one to run. See mem_cgroup_sk_alloc()
		 * for details, and note that we don't mark any socket as
		 * belonging to this memcg until that flag is up.
		 *
		 * We need to do this, because static_keys will span multiple
		 * sites, but we can't control their order. If we mark a socket
		 * as accounted, but the accounting functions are not patched in
		 * yet, we'll lose accounting.
		 *
		 * We never race with the readers in mem_cgroup_sk_alloc(),
		 * because when this value change, the code to process it is not
		 * patched in yet.
		 */
		static_branch_inc(&memcg_sockets_enabled_key);
		memcg->tcpmem_active = true;
	}
out:
	mutex_unlock(&memcg_max_mutex);
	return ret;
}

/*
 * The user of this function is...
 * RES_LIMIT.
 */
static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
				char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long nr_pages;
	int ret;

	buf = strstrip(buf);
	/* "-1" means "no limit" (PAGE_COUNTER_MAX). */
	ret = page_counter_memparse(buf, "-1", &nr_pages);
	if (ret)
		return ret;

	switch (MEMFILE_ATTR(of_cft(of)->private)) {
	case RES_LIMIT:
		if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
			ret = -EINVAL;
			break;
		}
		switch (MEMFILE_TYPE(of_cft(of)->private)) {
		case _MEM:
			ret = mem_cgroup_resize_max(memcg, nr_pages, false);
			break;
		case _MEMSWAP:
			ret = mem_cgroup_resize_max(memcg, nr_pages, true);
			break;
		case _KMEM:
			pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
				     "Writing any value to this file has no effect. "
				     "Please report your usecase to linux-mm@kvack.org if you "
				     "depend on this functionality.\n");
			ret = 0;
			break;
		case _TCP:
			ret = memcg_update_tcp_max(memcg, nr_pages);
			break;
		}
		break;
	case RES_SOFT_LIMIT:
		/* Soft limits are not supported on PREEMPT_RT. */
		if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
			ret = -EOPNOTSUPP;
		} else {
			WRITE_ONCE(memcg->soft_limit, nr_pages);
			ret = 0;
		}
		break;
	}
	return ret ?: nbytes;
}

/*
 * Write handler that resets a counter attribute: RES_MAX_USAGE clears the
 * high watermark, RES_FAILCNT clears the failure counter. The written
 * value itself is ignored.
 */
static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
				size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	struct page_counter *counter;

	switch (MEMFILE_TYPE(of_cft(of)->private)) {
	case _MEM:
		counter = &memcg->memory;
		break;
	case _MEMSWAP:
		counter = &memcg->memsw;
		break;
	case _KMEM:
		counter = &memcg->kmem;
		break;
	case _TCP:
		counter = &memcg->tcpmem;
		break;
	default:
		BUG();
	}

	switch (MEMFILE_ATTR(of_cft(of)->private)) {
	case RES_MAX_USAGE:
		page_counter_reset_watermark(counter);
		break;
	case RES_FAILCNT:
		counter->failcnt = 0;
		break;
	default:
		BUG();
	}

	return nbytes;
}

/* Expose the move_charge_at_immigrate bitmask (deprecated interface). */
static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
				       struct cftype *cft)
{
	return mem_cgroup_from_css(css)->move_charge_at_immigrate;
}

#ifdef CONFIG_MMU
static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
					struct cftype *cft, u64 val)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	pr_warn_once("Cgroup memory moving (move_charge_at_immigrate) is deprecated. "
		     "Please report your usecase to linux-mm@kvack.org if you "
		     "depend on this functionality.\n");

	if (val & ~MOVE_MASK)
		return -EINVAL;

	/*
	 * No kind of locking is needed in here, because ->can_attach() will
	 * check this value once in the beginning of the process, and then carry
	 * on with stale data. This means that changes to this value will only
	 * affect task migrations starting after the change.
	 */
	memcg->move_charge_at_immigrate = val;
	return 0;
}
#else
static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
					struct cftype *cft, u64 val)
{
	return -ENOSYS;
}
#endif

#ifdef CONFIG_NUMA

#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
#define LRU_ALL	     ((1 << NR_LRU_LISTS) - 1)

/*
 * Sum the LRU list sizes selected by @lru_mask for @memcg on node @nid.
 * @tree selects lruvec_page_state() over the _local variant (cumulative
 * subtree counters vs. this cgroup only).
 */
static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
				int nid, unsigned int lru_mask, bool tree)
{
	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
	unsigned long nr = 0;
	enum lru_list lru;

	VM_BUG_ON((unsigned)nid >= nr_node_ids);

	for_each_lru(lru) {
		if (!(BIT(lru) & lru_mask))
			continue;
		if (tree)
			nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
		else
			nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
	}
	return nr;
}

/* As above, but summed over all nodes via the memcg-wide counters. */
static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
					     unsigned int lru_mask,
					     bool tree)
{
	unsigned long nr = 0;
	enum lru_list lru;

	for_each_lru(lru) {
		if (!(BIT(lru) & lru_mask))
			continue;
		if (tree)
			nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
		else
			nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
	}
	return nr;
}

/*
 * Show memory.numa_stat: for each stat class, the per-node LRU page counts,
 * first local ("<name>=... N0=..."), then cumulative ("hierarchical_<name>=").
 */
static int memcg_numa_stat_show(struct seq_file *m, void *v)
{
	struct numa_stat {
		const char *name;
		unsigned int lru_mask;
	};

	static const struct numa_stat stats[] = {
		{ "total", LRU_ALL },
		{ "file", LRU_ALL_FILE },
		{ "anon", LRU_ALL_ANON },
		{ "unevictable", BIT(LRU_UNEVICTABLE) },
	};
	const struct numa_stat *stat;
	int nid;
	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);

	mem_cgroup_flush_stats();

	for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
		seq_printf(m, "%s=%lu", stat->name,
			   mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
						   false));
		for_each_node_state(nid, N_MEMORY)
			seq_printf(m, " N%d=%lu", nid,
				   mem_cgroup_node_nr_lru_pages(memcg, nid,
							stat->lru_mask, false));
		seq_putc(m, '\n');
	}

	for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {

		seq_printf(m, "hierarchical_%s=%lu", stat->name,
			   mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
						   true));
		for_each_node_state(nid, N_MEMORY)
			seq_printf(m, " N%d=%lu", nid,
				   mem_cgroup_node_nr_lru_pages(memcg, nid,
							stat->lru_mask, true));
		seq_putc(m, '\n');
	}

	return 0;
}
#endif /* CONFIG_NUMA */

/*
 * Stat items shown in cgroup1 memory.stat; memcg1_stats[i] is printed under
 * the legacy name memcg1_stat_names[i] (sizes kept in sync by the
 * BUILD_BUG_ON in memcg1_stat_format()).
 */
static const unsigned int memcg1_stats[] = {
	NR_FILE_PAGES,
	NR_ANON_MAPPED,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	NR_ANON_THPS,
#endif
	NR_SHMEM,
	NR_FILE_MAPPED,
	NR_FILE_DIRTY,
	NR_WRITEBACK,
	WORKINGSET_REFAULT_ANON,
	WORKINGSET_REFAULT_FILE,
	MEMCG_SWAP,
};

static const char *const memcg1_stat_names[] = {
	"cache",
	"rss",
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	"rss_huge",
#endif
	"shmem",
	"mapped_file",
	"dirty",
	"writeback",
	"workingset_refault_anon",
	"workingset_refault_file",
	"swap",
};

/* Universal VM events cgroup1 shows, original sort order */
static const unsigned int memcg1_events[] = {
	PGPGIN,
	PGPGOUT,
	PGFAULT,
	PGMAJFAULT,
};

/*
 * Format the cgroup1 memory.stat payload into @s: local stats, events and
 * per-LRU sizes, then the effective hierarchical limits, then the "total_"
 * (cumulative) variants, plus anon/file cost values under CONFIG_DEBUG_VM.
 */
static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
{
	unsigned long memory, memsw;
	struct mem_cgroup *mi;
	unsigned int i;

	BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));

	mem_cgroup_flush_stats();

	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
		unsigned long nr;

		/* Swap is only reported when memsw accounting is active. */
		if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
			continue;
		nr = memcg_page_state_local(memcg, memcg1_stats[i]);
		seq_buf_printf(s, "%s %lu\n", memcg1_stat_names[i],
			   nr * memcg_page_state_unit(memcg1_stats[i]));
	}

	for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
		seq_buf_printf(s, "%s %lu\n", vm_event_name(memcg1_events[i]),
			       memcg_events_local(memcg, memcg1_events[i]));

	for (i = 0; i < NR_LRU_LISTS; i++)
		seq_buf_printf(s, "%s %lu\n", lru_list_name(i),
			       memcg_page_state_local(memcg, NR_LRU_BASE + i) *
			       PAGE_SIZE);

	/* Hierarchical information */
	/* The effective limit is the minimum along the ancestor chain. */
	memory = memsw = PAGE_COUNTER_MAX;
	for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
		memory = min(memory, READ_ONCE(mi->memory.max));
		memsw = min(memsw, READ_ONCE(mi->memsw.max));
	}
	seq_buf_printf(s, "hierarchical_memory_limit %llu\n",
		       (u64)memory * PAGE_SIZE);
	if (do_memsw_account())
		seq_buf_printf(s, "hierarchical_memsw_limit %llu\n",
			       (u64)memsw * PAGE_SIZE);

	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
		unsigned long nr;

		if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
			continue;
		nr = memcg_page_state(memcg, memcg1_stats[i]);
		seq_buf_printf(s, "total_%s %llu\n", memcg1_stat_names[i],
			       (u64)nr * memcg_page_state_unit(memcg1_stats[i]));
	}

	for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
		seq_buf_printf(s, "total_%s %llu\n",
			       vm_event_name(memcg1_events[i]),
			       (u64)memcg_events(memcg, memcg1_events[i]));

	for (i = 0; i < NR_LRU_LISTS; i++)
		seq_buf_printf(s, "total_%s %llu\n", lru_list_name(i),
			       (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
			       PAGE_SIZE);

#ifdef CONFIG_DEBUG_VM
	{
		pg_data_t *pgdat;
		struct mem_cgroup_per_node *mz;
		unsigned long anon_cost = 0;
		unsigned long file_cost = 0;

		for_each_online_pgdat(pgdat) {
			mz = memcg->nodeinfo[pgdat->node_id];

			anon_cost += mz->lruvec.anon_cost;
			file_cost += mz->lruvec.file_cost;
		}
		seq_buf_printf(s, "anon_cost %lu\n", anon_cost);
		seq_buf_printf(s, "file_cost %lu\n", file_cost);
	}
#endif
}

static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
				      struct cftype *cft)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	return mem_cgroup_swappiness(memcg);
}

/*
 * Set swappiness (valid range 0-200). For the root cgroup this writes the
 * global vm_swappiness instead of a per-memcg value.
 */
static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
				       struct cftype *cft, u64 val)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	if (val > 200)
		return -EINVAL;

	if (!mem_cgroup_is_root(memcg))
		WRITE_ONCE(memcg->swappiness, val);
	else
		WRITE_ONCE(vm_swappiness, val);

	return 0;
}

/*
 * Signal the eventfd of every threshold that usage has crossed since the
 * last invocation, scanning outward from current_threshold in both
 * directions, then record the new current_threshold. The threshold array
 * is RCU-published, hence the rcu_read_lock() around the walk.
 */
static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
{
	struct mem_cgroup_threshold_ary *t;
	unsigned long usage;
	int i;

	rcu_read_lock();
	if (!swap)
		t = rcu_dereference(memcg->thresholds.primary);
	else
		t = rcu_dereference(memcg->memsw_thresholds.primary);

	if (!t)
		goto unlock;

	usage = mem_cgroup_usage(memcg, swap);

	/*
	 * current_threshold points to threshold just below or equal to usage.
	 * If it's not true, a threshold was crossed after last
	 * call of __mem_cgroup_threshold().
	 */
	i = t->current_threshold;

	/*
	 * Iterate backward over array of thresholds starting from
	 * current_threshold and check if a threshold is crossed.
	 * If none of thresholds below usage is crossed, we read
	 * only one element of the array here.
	 */
	for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
		eventfd_signal(t->entries[i].eventfd, 1);

	/* i = current_threshold + 1 */
	i++;

	/*
	 * Iterate forward over array of thresholds starting from
	 * current_threshold+1 and check if a threshold is crossed.
	 * If none of thresholds above usage is crossed, we read
	 * only one element of the array here.
	 */
	for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
		eventfd_signal(t->entries[i].eventfd, 1);

	/* Update current_threshold */
	t->current_threshold = i - 1;
unlock:
	rcu_read_unlock();
}

/* Check memory (and, with memsw accounting, memory+swap) thresholds for
 * @memcg and every ancestor up to the root. */
static void mem_cgroup_threshold(struct mem_cgroup *memcg)
{
	while (memcg) {
		__mem_cgroup_threshold(memcg, false);
		if (do_memsw_account())
			__mem_cgroup_threshold(memcg, true);

		memcg = parent_mem_cgroup(memcg);
	}
}

/* Ascending threshold comparator; explicit compares avoid the overflow a
 * plain subtraction could produce. */
static int compare_thresholds(const void *a, const void *b)
{
	const struct mem_cgroup_threshold *_a = a;
	const struct mem_cgroup_threshold *_b = b;

	if (_a->threshold > _b->threshold)
		return 1;

	if (_a->threshold < _b->threshold)
		return -1;

	return 0;
}

/* Signal every eventfd registered for OOM notification on @memcg. */
static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
{
	struct mem_cgroup_eventfd_list *ev;

	spin_lock(&memcg_oom_lock);

	list_for_each_entry(ev, &memcg->oom_notify, list)
		eventfd_signal(ev->eventfd, 1);

	spin_unlock(&memcg_oom_lock);
	return 0;
}

/* Notify OOM listeners of @memcg and its whole subtree. */
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, memcg)
		mem_cgroup_oom_notify_cb(iter);
}

/*
 * Register @eventfd to be signalled when usage of @memcg crosses the
 * threshold parsed from @args (bytes). @type selects the memory (_MEM) or
 * memory+swap (_MEMSWAP) threshold set.
 */
static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd, const char *args, enum res_type type)
{
	struct mem_cgroup_thresholds *thresholds;
	struct mem_cgroup_threshold_ary *new;
	unsigned long threshold;
	unsigned long usage;
	int i, size, ret;

	ret = page_counter_memparse(args, "-1", &threshold);
	if (ret)
		return ret;

	mutex_lock(&memcg->thresholds_lock);

	if (type == _MEM) {
		thresholds = &memcg->thresholds;
		usage = mem_cgroup_usage(memcg, false);
	} else if (type == _MEMSWAP) {
		thresholds = &memcg->memsw_thresholds;
		usage = mem_cgroup_usage(memcg, true);
	} else
		BUG();

	/* Check if a threshold crossed before adding a new one */
	if (thresholds->primary)
		__mem_cgroup_threshold(memcg, type == _MEMSWAP);

	size = thresholds->primary ?
thresholds->primary->size + 1 : 1; 435662306a36Sopenharmony_ci 435762306a36Sopenharmony_ci /* Allocate memory for new array of thresholds */ 435862306a36Sopenharmony_ci new = kmalloc(struct_size(new, entries, size), GFP_KERNEL); 435962306a36Sopenharmony_ci if (!new) { 436062306a36Sopenharmony_ci ret = -ENOMEM; 436162306a36Sopenharmony_ci goto unlock; 436262306a36Sopenharmony_ci } 436362306a36Sopenharmony_ci new->size = size; 436462306a36Sopenharmony_ci 436562306a36Sopenharmony_ci /* Copy thresholds (if any) to new array */ 436662306a36Sopenharmony_ci if (thresholds->primary) 436762306a36Sopenharmony_ci memcpy(new->entries, thresholds->primary->entries, 436862306a36Sopenharmony_ci flex_array_size(new, entries, size - 1)); 436962306a36Sopenharmony_ci 437062306a36Sopenharmony_ci /* Add new threshold */ 437162306a36Sopenharmony_ci new->entries[size - 1].eventfd = eventfd; 437262306a36Sopenharmony_ci new->entries[size - 1].threshold = threshold; 437362306a36Sopenharmony_ci 437462306a36Sopenharmony_ci /* Sort thresholds. Registering of new threshold isn't time-critical */ 437562306a36Sopenharmony_ci sort(new->entries, size, sizeof(*new->entries), 437662306a36Sopenharmony_ci compare_thresholds, NULL); 437762306a36Sopenharmony_ci 437862306a36Sopenharmony_ci /* Find current threshold */ 437962306a36Sopenharmony_ci new->current_threshold = -1; 438062306a36Sopenharmony_ci for (i = 0; i < size; i++) { 438162306a36Sopenharmony_ci if (new->entries[i].threshold <= usage) { 438262306a36Sopenharmony_ci /* 438362306a36Sopenharmony_ci * new->current_threshold will not be used until 438462306a36Sopenharmony_ci * rcu_assign_pointer(), so it's safe to increment 438562306a36Sopenharmony_ci * it here. 
438662306a36Sopenharmony_ci */ 438762306a36Sopenharmony_ci ++new->current_threshold; 438862306a36Sopenharmony_ci } else 438962306a36Sopenharmony_ci break; 439062306a36Sopenharmony_ci } 439162306a36Sopenharmony_ci 439262306a36Sopenharmony_ci /* Free old spare buffer and save old primary buffer as spare */ 439362306a36Sopenharmony_ci kfree(thresholds->spare); 439462306a36Sopenharmony_ci thresholds->spare = thresholds->primary; 439562306a36Sopenharmony_ci 439662306a36Sopenharmony_ci rcu_assign_pointer(thresholds->primary, new); 439762306a36Sopenharmony_ci 439862306a36Sopenharmony_ci /* To be sure that nobody uses thresholds */ 439962306a36Sopenharmony_ci synchronize_rcu(); 440062306a36Sopenharmony_ci 440162306a36Sopenharmony_ciunlock: 440262306a36Sopenharmony_ci mutex_unlock(&memcg->thresholds_lock); 440362306a36Sopenharmony_ci 440462306a36Sopenharmony_ci return ret; 440562306a36Sopenharmony_ci} 440662306a36Sopenharmony_ci 440762306a36Sopenharmony_cistatic int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 440862306a36Sopenharmony_ci struct eventfd_ctx *eventfd, const char *args) 440962306a36Sopenharmony_ci{ 441062306a36Sopenharmony_ci return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); 441162306a36Sopenharmony_ci} 441262306a36Sopenharmony_ci 441362306a36Sopenharmony_cistatic int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, 441462306a36Sopenharmony_ci struct eventfd_ctx *eventfd, const char *args) 441562306a36Sopenharmony_ci{ 441662306a36Sopenharmony_ci return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); 441762306a36Sopenharmony_ci} 441862306a36Sopenharmony_ci 441962306a36Sopenharmony_cistatic void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 442062306a36Sopenharmony_ci struct eventfd_ctx *eventfd, enum res_type type) 442162306a36Sopenharmony_ci{ 442262306a36Sopenharmony_ci struct mem_cgroup_thresholds *thresholds; 442362306a36Sopenharmony_ci struct mem_cgroup_threshold_ary *new; 
442462306a36Sopenharmony_ci unsigned long usage; 442562306a36Sopenharmony_ci int i, j, size, entries; 442662306a36Sopenharmony_ci 442762306a36Sopenharmony_ci mutex_lock(&memcg->thresholds_lock); 442862306a36Sopenharmony_ci 442962306a36Sopenharmony_ci if (type == _MEM) { 443062306a36Sopenharmony_ci thresholds = &memcg->thresholds; 443162306a36Sopenharmony_ci usage = mem_cgroup_usage(memcg, false); 443262306a36Sopenharmony_ci } else if (type == _MEMSWAP) { 443362306a36Sopenharmony_ci thresholds = &memcg->memsw_thresholds; 443462306a36Sopenharmony_ci usage = mem_cgroup_usage(memcg, true); 443562306a36Sopenharmony_ci } else 443662306a36Sopenharmony_ci BUG(); 443762306a36Sopenharmony_ci 443862306a36Sopenharmony_ci if (!thresholds->primary) 443962306a36Sopenharmony_ci goto unlock; 444062306a36Sopenharmony_ci 444162306a36Sopenharmony_ci /* Check if a threshold crossed before removing */ 444262306a36Sopenharmony_ci __mem_cgroup_threshold(memcg, type == _MEMSWAP); 444362306a36Sopenharmony_ci 444462306a36Sopenharmony_ci /* Calculate new number of threshold */ 444562306a36Sopenharmony_ci size = entries = 0; 444662306a36Sopenharmony_ci for (i = 0; i < thresholds->primary->size; i++) { 444762306a36Sopenharmony_ci if (thresholds->primary->entries[i].eventfd != eventfd) 444862306a36Sopenharmony_ci size++; 444962306a36Sopenharmony_ci else 445062306a36Sopenharmony_ci entries++; 445162306a36Sopenharmony_ci } 445262306a36Sopenharmony_ci 445362306a36Sopenharmony_ci new = thresholds->spare; 445462306a36Sopenharmony_ci 445562306a36Sopenharmony_ci /* If no items related to eventfd have been cleared, nothing to do */ 445662306a36Sopenharmony_ci if (!entries) 445762306a36Sopenharmony_ci goto unlock; 445862306a36Sopenharmony_ci 445962306a36Sopenharmony_ci /* Set thresholds array to NULL if we don't have thresholds */ 446062306a36Sopenharmony_ci if (!size) { 446162306a36Sopenharmony_ci kfree(new); 446262306a36Sopenharmony_ci new = NULL; 446362306a36Sopenharmony_ci goto swap_buffers; 
446462306a36Sopenharmony_ci } 446562306a36Sopenharmony_ci 446662306a36Sopenharmony_ci new->size = size; 446762306a36Sopenharmony_ci 446862306a36Sopenharmony_ci /* Copy thresholds and find current threshold */ 446962306a36Sopenharmony_ci new->current_threshold = -1; 447062306a36Sopenharmony_ci for (i = 0, j = 0; i < thresholds->primary->size; i++) { 447162306a36Sopenharmony_ci if (thresholds->primary->entries[i].eventfd == eventfd) 447262306a36Sopenharmony_ci continue; 447362306a36Sopenharmony_ci 447462306a36Sopenharmony_ci new->entries[j] = thresholds->primary->entries[i]; 447562306a36Sopenharmony_ci if (new->entries[j].threshold <= usage) { 447662306a36Sopenharmony_ci /* 447762306a36Sopenharmony_ci * new->current_threshold will not be used 447862306a36Sopenharmony_ci * until rcu_assign_pointer(), so it's safe to increment 447962306a36Sopenharmony_ci * it here. 448062306a36Sopenharmony_ci */ 448162306a36Sopenharmony_ci ++new->current_threshold; 448262306a36Sopenharmony_ci } 448362306a36Sopenharmony_ci j++; 448462306a36Sopenharmony_ci } 448562306a36Sopenharmony_ci 448662306a36Sopenharmony_ciswap_buffers: 448762306a36Sopenharmony_ci /* Swap primary and spare array */ 448862306a36Sopenharmony_ci thresholds->spare = thresholds->primary; 448962306a36Sopenharmony_ci 449062306a36Sopenharmony_ci rcu_assign_pointer(thresholds->primary, new); 449162306a36Sopenharmony_ci 449262306a36Sopenharmony_ci /* To be sure that nobody uses thresholds */ 449362306a36Sopenharmony_ci synchronize_rcu(); 449462306a36Sopenharmony_ci 449562306a36Sopenharmony_ci /* If all events are unregistered, free the spare array */ 449662306a36Sopenharmony_ci if (!new) { 449762306a36Sopenharmony_ci kfree(thresholds->spare); 449862306a36Sopenharmony_ci thresholds->spare = NULL; 449962306a36Sopenharmony_ci } 450062306a36Sopenharmony_ciunlock: 450162306a36Sopenharmony_ci mutex_unlock(&memcg->thresholds_lock); 450262306a36Sopenharmony_ci} 450362306a36Sopenharmony_ci 450462306a36Sopenharmony_cistatic void 
mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 450562306a36Sopenharmony_ci struct eventfd_ctx *eventfd) 450662306a36Sopenharmony_ci{ 450762306a36Sopenharmony_ci return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); 450862306a36Sopenharmony_ci} 450962306a36Sopenharmony_ci 451062306a36Sopenharmony_cistatic void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 451162306a36Sopenharmony_ci struct eventfd_ctx *eventfd) 451262306a36Sopenharmony_ci{ 451362306a36Sopenharmony_ci return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); 451462306a36Sopenharmony_ci} 451562306a36Sopenharmony_ci 451662306a36Sopenharmony_cistatic int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, 451762306a36Sopenharmony_ci struct eventfd_ctx *eventfd, const char *args) 451862306a36Sopenharmony_ci{ 451962306a36Sopenharmony_ci struct mem_cgroup_eventfd_list *event; 452062306a36Sopenharmony_ci 452162306a36Sopenharmony_ci event = kmalloc(sizeof(*event), GFP_KERNEL); 452262306a36Sopenharmony_ci if (!event) 452362306a36Sopenharmony_ci return -ENOMEM; 452462306a36Sopenharmony_ci 452562306a36Sopenharmony_ci spin_lock(&memcg_oom_lock); 452662306a36Sopenharmony_ci 452762306a36Sopenharmony_ci event->eventfd = eventfd; 452862306a36Sopenharmony_ci list_add(&event->list, &memcg->oom_notify); 452962306a36Sopenharmony_ci 453062306a36Sopenharmony_ci /* already in OOM ? 
*/ 453162306a36Sopenharmony_ci if (memcg->under_oom) 453262306a36Sopenharmony_ci eventfd_signal(eventfd, 1); 453362306a36Sopenharmony_ci spin_unlock(&memcg_oom_lock); 453462306a36Sopenharmony_ci 453562306a36Sopenharmony_ci return 0; 453662306a36Sopenharmony_ci} 453762306a36Sopenharmony_ci 453862306a36Sopenharmony_cistatic void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, 453962306a36Sopenharmony_ci struct eventfd_ctx *eventfd) 454062306a36Sopenharmony_ci{ 454162306a36Sopenharmony_ci struct mem_cgroup_eventfd_list *ev, *tmp; 454262306a36Sopenharmony_ci 454362306a36Sopenharmony_ci spin_lock(&memcg_oom_lock); 454462306a36Sopenharmony_ci 454562306a36Sopenharmony_ci list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { 454662306a36Sopenharmony_ci if (ev->eventfd == eventfd) { 454762306a36Sopenharmony_ci list_del(&ev->list); 454862306a36Sopenharmony_ci kfree(ev); 454962306a36Sopenharmony_ci } 455062306a36Sopenharmony_ci } 455162306a36Sopenharmony_ci 455262306a36Sopenharmony_ci spin_unlock(&memcg_oom_lock); 455362306a36Sopenharmony_ci} 455462306a36Sopenharmony_ci 455562306a36Sopenharmony_cistatic int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) 455662306a36Sopenharmony_ci{ 455762306a36Sopenharmony_ci struct mem_cgroup *memcg = mem_cgroup_from_seq(sf); 455862306a36Sopenharmony_ci 455962306a36Sopenharmony_ci seq_printf(sf, "oom_kill_disable %d\n", READ_ONCE(memcg->oom_kill_disable)); 456062306a36Sopenharmony_ci seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); 456162306a36Sopenharmony_ci seq_printf(sf, "oom_kill %lu\n", 456262306a36Sopenharmony_ci atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL])); 456362306a36Sopenharmony_ci return 0; 456462306a36Sopenharmony_ci} 456562306a36Sopenharmony_ci 456662306a36Sopenharmony_cistatic int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, 456762306a36Sopenharmony_ci struct cftype *cft, u64 val) 456862306a36Sopenharmony_ci{ 456962306a36Sopenharmony_ci struct mem_cgroup *memcg 
= mem_cgroup_from_css(css); 457062306a36Sopenharmony_ci 457162306a36Sopenharmony_ci /* cannot set to root cgroup and only 0 and 1 are allowed */ 457262306a36Sopenharmony_ci if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1))) 457362306a36Sopenharmony_ci return -EINVAL; 457462306a36Sopenharmony_ci 457562306a36Sopenharmony_ci WRITE_ONCE(memcg->oom_kill_disable, val); 457662306a36Sopenharmony_ci if (!val) 457762306a36Sopenharmony_ci memcg_oom_recover(memcg); 457862306a36Sopenharmony_ci 457962306a36Sopenharmony_ci return 0; 458062306a36Sopenharmony_ci} 458162306a36Sopenharmony_ci 458262306a36Sopenharmony_ci#ifdef CONFIG_CGROUP_WRITEBACK 458362306a36Sopenharmony_ci 458462306a36Sopenharmony_ci#include <trace/events/writeback.h> 458562306a36Sopenharmony_ci 458662306a36Sopenharmony_cistatic int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) 458762306a36Sopenharmony_ci{ 458862306a36Sopenharmony_ci return wb_domain_init(&memcg->cgwb_domain, gfp); 458962306a36Sopenharmony_ci} 459062306a36Sopenharmony_ci 459162306a36Sopenharmony_cistatic void memcg_wb_domain_exit(struct mem_cgroup *memcg) 459262306a36Sopenharmony_ci{ 459362306a36Sopenharmony_ci wb_domain_exit(&memcg->cgwb_domain); 459462306a36Sopenharmony_ci} 459562306a36Sopenharmony_ci 459662306a36Sopenharmony_cistatic void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) 459762306a36Sopenharmony_ci{ 459862306a36Sopenharmony_ci wb_domain_size_changed(&memcg->cgwb_domain); 459962306a36Sopenharmony_ci} 460062306a36Sopenharmony_ci 460162306a36Sopenharmony_cistruct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) 460262306a36Sopenharmony_ci{ 460362306a36Sopenharmony_ci struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); 460462306a36Sopenharmony_ci 460562306a36Sopenharmony_ci if (!memcg->css.parent) 460662306a36Sopenharmony_ci return NULL; 460762306a36Sopenharmony_ci 460862306a36Sopenharmony_ci return &memcg->cgwb_domain; 460962306a36Sopenharmony_ci} 461062306a36Sopenharmony_ci 
461162306a36Sopenharmony_ci/** 461262306a36Sopenharmony_ci * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg 461362306a36Sopenharmony_ci * @wb: bdi_writeback in question 461462306a36Sopenharmony_ci * @pfilepages: out parameter for number of file pages 461562306a36Sopenharmony_ci * @pheadroom: out parameter for number of allocatable pages according to memcg 461662306a36Sopenharmony_ci * @pdirty: out parameter for number of dirty pages 461762306a36Sopenharmony_ci * @pwriteback: out parameter for number of pages under writeback 461862306a36Sopenharmony_ci * 461962306a36Sopenharmony_ci * Determine the numbers of file, headroom, dirty, and writeback pages in 462062306a36Sopenharmony_ci * @wb's memcg. File, dirty and writeback are self-explanatory. Headroom 462162306a36Sopenharmony_ci * is a bit more involved. 462262306a36Sopenharmony_ci * 462362306a36Sopenharmony_ci * A memcg's headroom is "min(max, high) - used". In the hierarchy, the 462462306a36Sopenharmony_ci * headroom is calculated as the lowest headroom of itself and the 462562306a36Sopenharmony_ci * ancestors. Note that this doesn't consider the actual amount of 462662306a36Sopenharmony_ci * available memory in the system. The caller should further cap 462762306a36Sopenharmony_ci * *@pheadroom accordingly. 
462862306a36Sopenharmony_ci */ 462962306a36Sopenharmony_civoid mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, 463062306a36Sopenharmony_ci unsigned long *pheadroom, unsigned long *pdirty, 463162306a36Sopenharmony_ci unsigned long *pwriteback) 463262306a36Sopenharmony_ci{ 463362306a36Sopenharmony_ci struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); 463462306a36Sopenharmony_ci struct mem_cgroup *parent; 463562306a36Sopenharmony_ci 463662306a36Sopenharmony_ci mem_cgroup_flush_stats(); 463762306a36Sopenharmony_ci 463862306a36Sopenharmony_ci *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY); 463962306a36Sopenharmony_ci *pwriteback = memcg_page_state(memcg, NR_WRITEBACK); 464062306a36Sopenharmony_ci *pfilepages = memcg_page_state(memcg, NR_INACTIVE_FILE) + 464162306a36Sopenharmony_ci memcg_page_state(memcg, NR_ACTIVE_FILE); 464262306a36Sopenharmony_ci 464362306a36Sopenharmony_ci *pheadroom = PAGE_COUNTER_MAX; 464462306a36Sopenharmony_ci while ((parent = parent_mem_cgroup(memcg))) { 464562306a36Sopenharmony_ci unsigned long ceiling = min(READ_ONCE(memcg->memory.max), 464662306a36Sopenharmony_ci READ_ONCE(memcg->memory.high)); 464762306a36Sopenharmony_ci unsigned long used = page_counter_read(&memcg->memory); 464862306a36Sopenharmony_ci 464962306a36Sopenharmony_ci *pheadroom = min(*pheadroom, ceiling - min(ceiling, used)); 465062306a36Sopenharmony_ci memcg = parent; 465162306a36Sopenharmony_ci } 465262306a36Sopenharmony_ci} 465362306a36Sopenharmony_ci 465462306a36Sopenharmony_ci/* 465562306a36Sopenharmony_ci * Foreign dirty flushing 465662306a36Sopenharmony_ci * 465762306a36Sopenharmony_ci * There's an inherent mismatch between memcg and writeback. The former 465862306a36Sopenharmony_ci * tracks ownership per-page while the latter per-inode. 
This was a 465962306a36Sopenharmony_ci * deliberate design decision because honoring per-page ownership in the 466062306a36Sopenharmony_ci * writeback path is complicated, may lead to higher CPU and IO overheads 466162306a36Sopenharmony_ci * and deemed unnecessary given that write-sharing an inode across 466262306a36Sopenharmony_ci * different cgroups isn't a common use-case. 466362306a36Sopenharmony_ci * 466462306a36Sopenharmony_ci * Combined with inode majority-writer ownership switching, this works well 466562306a36Sopenharmony_ci * enough in most cases but there are some pathological cases. For 466662306a36Sopenharmony_ci * example, let's say there are two cgroups A and B which keep writing to 466762306a36Sopenharmony_ci * different but confined parts of the same inode. B owns the inode and 466862306a36Sopenharmony_ci * A's memory is limited far below B's. A's dirty ratio can rise enough to 466962306a36Sopenharmony_ci * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid 467062306a36Sopenharmony_ci * triggering background writeback. A will be slowed down without a way to 467162306a36Sopenharmony_ci * make writeback of the dirty pages happen. 467262306a36Sopenharmony_ci * 467362306a36Sopenharmony_ci * Conditions like the above can lead to a cgroup getting repeatedly and 467462306a36Sopenharmony_ci * severely throttled after making some progress after each 467562306a36Sopenharmony_ci * dirty_expire_interval while the underlying IO device is almost 467662306a36Sopenharmony_ci * completely idle. 467762306a36Sopenharmony_ci * 467862306a36Sopenharmony_ci * Solving this problem completely requires matching the ownership tracking 467962306a36Sopenharmony_ci * granularities between memcg and writeback in either direction. 
However, 468062306a36Sopenharmony_ci * the more egregious behaviors can be avoided by simply remembering the 468162306a36Sopenharmony_ci * most recent foreign dirtying events and initiating remote flushes on 468262306a36Sopenharmony_ci * them when local writeback isn't enough to keep the memory clean enough. 468362306a36Sopenharmony_ci * 468462306a36Sopenharmony_ci * The following two functions implement such mechanism. When a foreign 468562306a36Sopenharmony_ci * page - a page whose memcg and writeback ownerships don't match - is 468662306a36Sopenharmony_ci * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning 468762306a36Sopenharmony_ci * bdi_writeback on the page owning memcg. When balance_dirty_pages() 468862306a36Sopenharmony_ci * decides that the memcg needs to sleep due to high dirty ratio, it calls 468962306a36Sopenharmony_ci * mem_cgroup_flush_foreign() which queues writeback on the recorded 469062306a36Sopenharmony_ci * foreign bdi_writebacks which haven't expired. Both the numbers of 469162306a36Sopenharmony_ci * recorded bdi_writebacks and concurrent in-flight foreign writebacks are 469262306a36Sopenharmony_ci * limited to MEMCG_CGWB_FRN_CNT. 469362306a36Sopenharmony_ci * 469462306a36Sopenharmony_ci * The mechanism only remembers IDs and doesn't hold any object references. 469562306a36Sopenharmony_ci * As being wrong occasionally doesn't matter, updates and accesses to the 469662306a36Sopenharmony_ci * records are lockless and racy. 
469762306a36Sopenharmony_ci */ 469862306a36Sopenharmony_civoid mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio, 469962306a36Sopenharmony_ci struct bdi_writeback *wb) 470062306a36Sopenharmony_ci{ 470162306a36Sopenharmony_ci struct mem_cgroup *memcg = folio_memcg(folio); 470262306a36Sopenharmony_ci struct memcg_cgwb_frn *frn; 470362306a36Sopenharmony_ci u64 now = get_jiffies_64(); 470462306a36Sopenharmony_ci u64 oldest_at = now; 470562306a36Sopenharmony_ci int oldest = -1; 470662306a36Sopenharmony_ci int i; 470762306a36Sopenharmony_ci 470862306a36Sopenharmony_ci trace_track_foreign_dirty(folio, wb); 470962306a36Sopenharmony_ci 471062306a36Sopenharmony_ci /* 471162306a36Sopenharmony_ci * Pick the slot to use. If there is already a slot for @wb, keep 471262306a36Sopenharmony_ci * using it. If not replace the oldest one which isn't being 471362306a36Sopenharmony_ci * written out. 471462306a36Sopenharmony_ci */ 471562306a36Sopenharmony_ci for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) { 471662306a36Sopenharmony_ci frn = &memcg->cgwb_frn[i]; 471762306a36Sopenharmony_ci if (frn->bdi_id == wb->bdi->id && 471862306a36Sopenharmony_ci frn->memcg_id == wb->memcg_css->id) 471962306a36Sopenharmony_ci break; 472062306a36Sopenharmony_ci if (time_before64(frn->at, oldest_at) && 472162306a36Sopenharmony_ci atomic_read(&frn->done.cnt) == 1) { 472262306a36Sopenharmony_ci oldest = i; 472362306a36Sopenharmony_ci oldest_at = frn->at; 472462306a36Sopenharmony_ci } 472562306a36Sopenharmony_ci } 472662306a36Sopenharmony_ci 472762306a36Sopenharmony_ci if (i < MEMCG_CGWB_FRN_CNT) { 472862306a36Sopenharmony_ci /* 472962306a36Sopenharmony_ci * Re-using an existing one. Update timestamp lazily to 473062306a36Sopenharmony_ci * avoid making the cacheline hot. We want them to be 473162306a36Sopenharmony_ci * reasonably up-to-date and significantly shorter than 473262306a36Sopenharmony_ci * dirty_expire_interval as that's what expires the record. 
473362306a36Sopenharmony_ci * Use the shorter of 1s and dirty_expire_interval / 8. 473462306a36Sopenharmony_ci */ 473562306a36Sopenharmony_ci unsigned long update_intv = 473662306a36Sopenharmony_ci min_t(unsigned long, HZ, 473762306a36Sopenharmony_ci msecs_to_jiffies(dirty_expire_interval * 10) / 8); 473862306a36Sopenharmony_ci 473962306a36Sopenharmony_ci if (time_before64(frn->at, now - update_intv)) 474062306a36Sopenharmony_ci frn->at = now; 474162306a36Sopenharmony_ci } else if (oldest >= 0) { 474262306a36Sopenharmony_ci /* replace the oldest free one */ 474362306a36Sopenharmony_ci frn = &memcg->cgwb_frn[oldest]; 474462306a36Sopenharmony_ci frn->bdi_id = wb->bdi->id; 474562306a36Sopenharmony_ci frn->memcg_id = wb->memcg_css->id; 474662306a36Sopenharmony_ci frn->at = now; 474762306a36Sopenharmony_ci } 474862306a36Sopenharmony_ci} 474962306a36Sopenharmony_ci 475062306a36Sopenharmony_ci/* issue foreign writeback flushes for recorded foreign dirtying events */ 475162306a36Sopenharmony_civoid mem_cgroup_flush_foreign(struct bdi_writeback *wb) 475262306a36Sopenharmony_ci{ 475362306a36Sopenharmony_ci struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); 475462306a36Sopenharmony_ci unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10); 475562306a36Sopenharmony_ci u64 now = jiffies_64; 475662306a36Sopenharmony_ci int i; 475762306a36Sopenharmony_ci 475862306a36Sopenharmony_ci for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) { 475962306a36Sopenharmony_ci struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i]; 476062306a36Sopenharmony_ci 476162306a36Sopenharmony_ci /* 476262306a36Sopenharmony_ci * If the record is older than dirty_expire_interval, 476362306a36Sopenharmony_ci * writeback on it has already started. No need to kick it 476462306a36Sopenharmony_ci * off again. Also, don't start a new one if there's 476562306a36Sopenharmony_ci * already one in flight. 
476662306a36Sopenharmony_ci */ 476762306a36Sopenharmony_ci if (time_after64(frn->at, now - intv) && 476862306a36Sopenharmony_ci atomic_read(&frn->done.cnt) == 1) { 476962306a36Sopenharmony_ci frn->at = 0; 477062306a36Sopenharmony_ci trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id); 477162306a36Sopenharmony_ci cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 477262306a36Sopenharmony_ci WB_REASON_FOREIGN_FLUSH, 477362306a36Sopenharmony_ci &frn->done); 477462306a36Sopenharmony_ci } 477562306a36Sopenharmony_ci } 477662306a36Sopenharmony_ci} 477762306a36Sopenharmony_ci 477862306a36Sopenharmony_ci#else /* CONFIG_CGROUP_WRITEBACK */ 477962306a36Sopenharmony_ci 478062306a36Sopenharmony_cistatic int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) 478162306a36Sopenharmony_ci{ 478262306a36Sopenharmony_ci return 0; 478362306a36Sopenharmony_ci} 478462306a36Sopenharmony_ci 478562306a36Sopenharmony_cistatic void memcg_wb_domain_exit(struct mem_cgroup *memcg) 478662306a36Sopenharmony_ci{ 478762306a36Sopenharmony_ci} 478862306a36Sopenharmony_ci 478962306a36Sopenharmony_cistatic void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) 479062306a36Sopenharmony_ci{ 479162306a36Sopenharmony_ci} 479262306a36Sopenharmony_ci 479362306a36Sopenharmony_ci#endif /* CONFIG_CGROUP_WRITEBACK */ 479462306a36Sopenharmony_ci 479562306a36Sopenharmony_ci/* 479662306a36Sopenharmony_ci * DO NOT USE IN NEW FILES. 479762306a36Sopenharmony_ci * 479862306a36Sopenharmony_ci * "cgroup.event_control" implementation. 479962306a36Sopenharmony_ci * 480062306a36Sopenharmony_ci * This is way over-engineered. It tries to support fully configurable 480162306a36Sopenharmony_ci * events for each user. Such level of flexibility is completely 480262306a36Sopenharmony_ci * unnecessary especially in the light of the planned unified hierarchy. 
480362306a36Sopenharmony_ci * 480462306a36Sopenharmony_ci * Please deprecate this and replace with something simpler if at all 480562306a36Sopenharmony_ci * possible. 480662306a36Sopenharmony_ci */ 480762306a36Sopenharmony_ci 480862306a36Sopenharmony_ci/* 480962306a36Sopenharmony_ci * Unregister event and free resources. 481062306a36Sopenharmony_ci * 481162306a36Sopenharmony_ci * Gets called from workqueue. 481262306a36Sopenharmony_ci */ 481362306a36Sopenharmony_cistatic void memcg_event_remove(struct work_struct *work) 481462306a36Sopenharmony_ci{ 481562306a36Sopenharmony_ci struct mem_cgroup_event *event = 481662306a36Sopenharmony_ci container_of(work, struct mem_cgroup_event, remove); 481762306a36Sopenharmony_ci struct mem_cgroup *memcg = event->memcg; 481862306a36Sopenharmony_ci 481962306a36Sopenharmony_ci remove_wait_queue(event->wqh, &event->wait); 482062306a36Sopenharmony_ci 482162306a36Sopenharmony_ci event->unregister_event(memcg, event->eventfd); 482262306a36Sopenharmony_ci 482362306a36Sopenharmony_ci /* Notify userspace the event is going away. */ 482462306a36Sopenharmony_ci eventfd_signal(event->eventfd, 1); 482562306a36Sopenharmony_ci 482662306a36Sopenharmony_ci eventfd_ctx_put(event->eventfd); 482762306a36Sopenharmony_ci kfree(event); 482862306a36Sopenharmony_ci css_put(&memcg->css); 482962306a36Sopenharmony_ci} 483062306a36Sopenharmony_ci 483162306a36Sopenharmony_ci/* 483262306a36Sopenharmony_ci * Gets called on EPOLLHUP on eventfd when user closes it. 483362306a36Sopenharmony_ci * 483462306a36Sopenharmony_ci * Called with wqh->lock held and interrupts disabled. 
483562306a36Sopenharmony_ci */ 483662306a36Sopenharmony_cistatic int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode, 483762306a36Sopenharmony_ci int sync, void *key) 483862306a36Sopenharmony_ci{ 483962306a36Sopenharmony_ci struct mem_cgroup_event *event = 484062306a36Sopenharmony_ci container_of(wait, struct mem_cgroup_event, wait); 484162306a36Sopenharmony_ci struct mem_cgroup *memcg = event->memcg; 484262306a36Sopenharmony_ci __poll_t flags = key_to_poll(key); 484362306a36Sopenharmony_ci 484462306a36Sopenharmony_ci if (flags & EPOLLHUP) { 484562306a36Sopenharmony_ci /* 484662306a36Sopenharmony_ci * If the event has been detached at cgroup removal, we 484762306a36Sopenharmony_ci * can simply return knowing the other side will cleanup 484862306a36Sopenharmony_ci * for us. 484962306a36Sopenharmony_ci * 485062306a36Sopenharmony_ci * We can't race against event freeing since the other 485162306a36Sopenharmony_ci * side will require wqh->lock via remove_wait_queue(), 485262306a36Sopenharmony_ci * which we hold. 485362306a36Sopenharmony_ci */ 485462306a36Sopenharmony_ci spin_lock(&memcg->event_list_lock); 485562306a36Sopenharmony_ci if (!list_empty(&event->list)) { 485662306a36Sopenharmony_ci list_del_init(&event->list); 485762306a36Sopenharmony_ci /* 485862306a36Sopenharmony_ci * We are in atomic context, but cgroup_event_remove() 485962306a36Sopenharmony_ci * may sleep, so we have to call it in workqueue. 
486062306a36Sopenharmony_ci */ 486162306a36Sopenharmony_ci schedule_work(&event->remove); 486262306a36Sopenharmony_ci } 486362306a36Sopenharmony_ci spin_unlock(&memcg->event_list_lock); 486462306a36Sopenharmony_ci } 486562306a36Sopenharmony_ci 486662306a36Sopenharmony_ci return 0; 486762306a36Sopenharmony_ci} 486862306a36Sopenharmony_ci 486962306a36Sopenharmony_cistatic void memcg_event_ptable_queue_proc(struct file *file, 487062306a36Sopenharmony_ci wait_queue_head_t *wqh, poll_table *pt) 487162306a36Sopenharmony_ci{ 487262306a36Sopenharmony_ci struct mem_cgroup_event *event = 487362306a36Sopenharmony_ci container_of(pt, struct mem_cgroup_event, pt); 487462306a36Sopenharmony_ci 487562306a36Sopenharmony_ci event->wqh = wqh; 487662306a36Sopenharmony_ci add_wait_queue(wqh, &event->wait); 487762306a36Sopenharmony_ci} 487862306a36Sopenharmony_ci 487962306a36Sopenharmony_ci/* 488062306a36Sopenharmony_ci * DO NOT USE IN NEW FILES. 488162306a36Sopenharmony_ci * 488262306a36Sopenharmony_ci * Parse input and register new cgroup event handler. 488362306a36Sopenharmony_ci * 488462306a36Sopenharmony_ci * Input must be in format '<event_fd> <control_fd> <args>'. 488562306a36Sopenharmony_ci * Interpretation of args is defined by control file implementation. 
 */
static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
					 char *buf, size_t nbytes, loff_t off)
{
	struct cgroup_subsys_state *css = of_css(of);
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
	struct mem_cgroup_event *event;
	struct cgroup_subsys_state *cfile_css;
	unsigned int efd, cfd;
	struct fd efile;
	struct fd cfile;
	struct dentry *cdentry;
	const char *name;
	char *endp;
	int ret;

	if (IS_ENABLED(CONFIG_PREEMPT_RT))
		return -EOPNOTSUPP;

	buf = strstrip(buf);

	/* Parse "<event_fd> <control_fd> <args>"; <args> stays in @buf. */
	efd = simple_strtoul(buf, &endp, 10);
	if (*endp != ' ')
		return -EINVAL;
	buf = endp + 1;

	cfd = simple_strtoul(buf, &endp, 10);
	if ((*endp != ' ') && (*endp != '\0'))
		return -EINVAL;
	buf = endp + 1;

	event = kzalloc(sizeof(*event), GFP_KERNEL);
	if (!event)
		return -ENOMEM;

	event->memcg = memcg;
	INIT_LIST_HEAD(&event->list);
	init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
	init_waitqueue_func_entry(&event->wait, memcg_event_wake);
	INIT_WORK(&event->remove, memcg_event_remove);

	efile = fdget(efd);
	if (!efile.file) {
		ret = -EBADF;
		goto out_kfree;
	}

	event->eventfd = eventfd_ctx_fileget(efile.file);
	if (IS_ERR(event->eventfd)) {
		ret = PTR_ERR(event->eventfd);
		goto out_put_efile;
	}

	cfile = fdget(cfd);
	if (!cfile.file) {
		ret = -EBADF;
		goto out_put_eventfd;
	}

	/* the process need read permission on control file */
	/* AV: shouldn't we check that it's been opened for read instead? */
	ret = file_permission(cfile.file, MAY_READ);
	if (ret < 0)
		goto out_put_cfile;

	/*
	 * The control file must be a regular cgroup1 file. As a regular cgroup
	 * file can't be renamed, it's safe to access its name afterwards.
	 */
	cdentry = cfile.file->f_path.dentry;
	if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
		ret = -EINVAL;
		goto out_put_cfile;
	}

	/*
	 * Determine the event callbacks and set them in @event.  This used
	 * to be done via struct cftype but cgroup core no longer knows
	 * about these events.  The following is crude but the whole thing
	 * is for compatibility anyway.
	 *
	 * DO NOT ADD NEW FILES.
	 */
	name = cdentry->d_name.name;

	if (!strcmp(name, "memory.usage_in_bytes")) {
		event->register_event = mem_cgroup_usage_register_event;
		event->unregister_event = mem_cgroup_usage_unregister_event;
	} else if (!strcmp(name, "memory.oom_control")) {
		event->register_event = mem_cgroup_oom_register_event;
		event->unregister_event = mem_cgroup_oom_unregister_event;
	} else if (!strcmp(name, "memory.pressure_level")) {
		event->register_event = vmpressure_register_event;
		event->unregister_event = vmpressure_unregister_event;
	} else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
		event->register_event = memsw_cgroup_usage_register_event;
		event->unregister_event = memsw_cgroup_usage_unregister_event;
	} else {
		ret = -EINVAL;
		goto out_put_cfile;
	}

	/*
	 * Verify @cfile should belong to @css.  Also, remaining events are
	 * automatically removed on cgroup destruction but the removal is
	 * asynchronous, so take an extra ref on @css.
	 */
	cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
					       &memory_cgrp_subsys);
	ret = -EINVAL;
	if (IS_ERR(cfile_css))
		goto out_put_cfile;
	if (cfile_css != css) {
		css_put(cfile_css);
		goto out_put_cfile;
	}

	ret = event->register_event(memcg, event->eventfd, buf);
	if (ret)
		goto out_put_css;

	vfs_poll(efile.file, &event->pt);

	spin_lock_irq(&memcg->event_list_lock);
	list_add(&event->list, &memcg->event_list);
	spin_unlock_irq(&memcg->event_list_lock);

	fdput(cfile);
	fdput(efile);

	return nbytes;

	/* Error unwind: release resources in reverse order of acquisition. */
out_put_css:
	css_put(css);
out_put_cfile:
	fdput(cfile);
out_put_eventfd:
	eventfd_ctx_put(event->eventfd);
out_put_efile:
	fdput(efile);
out_kfree:
	kfree(event);

	return ret;
}

#if defined(CONFIG_MEMCG_KMEM) && (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
/* Stub kept only so the legacy "kmem.slabinfo" file still exists. */
static int mem_cgroup_slab_show(struct seq_file *m, void *p)
{
	/*
	 * Deprecated.
	 * Please, take a look at tools/cgroup/memcg_slabinfo.py .
	 */
	return 0;
}
#endif

static int memory_stat_show(struct seq_file *m, void *v);

/* Control files exposed on the cgroup v1 (legacy) hierarchy. */
static struct cftype mem_cgroup_legacy_files[] = {
	{
		.name = "usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
		.write = mem_cgroup_write,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "soft_limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
		.write = mem_cgroup_write,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "failcnt",
		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "stat",
		.seq_show = memory_stat_show,
	},
	{
		.name = "force_empty",
		.write = mem_cgroup_force_empty_write,
	},
	{
		.name = "use_hierarchy",
		.write_u64 = mem_cgroup_hierarchy_write,
		.read_u64 = mem_cgroup_hierarchy_read,
	},
	{
		.name = "cgroup.event_control",		/* XXX: for compat */
		.write = memcg_write_event_control,
		.flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
	},
	{
		.name = "swappiness",
		.read_u64 = mem_cgroup_swappiness_read,
		.write_u64 = mem_cgroup_swappiness_write,
	},
	{
		.name = "move_charge_at_immigrate",
		.read_u64 = mem_cgroup_move_charge_read,
		.write_u64 = mem_cgroup_move_charge_write,
	},
	{
		.name = "oom_control",
		.seq_show = mem_cgroup_oom_control_read,
		.write_u64 = mem_cgroup_oom_control_write,
	},
	{
		.name = "pressure_level",
		.seq_show = mem_cgroup_dummy_seq_show,
	},
#ifdef CONFIG_NUMA
	{
		.name = "numa_stat",
		.seq_show = memcg_numa_stat_show,
	},
#endif
	{
		.name = "kmem.limit_in_bytes",
		.private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
		.write = mem_cgroup_write,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "kmem.usage_in_bytes",
		.private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "kmem.failcnt",
		.private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "kmem.max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
#if defined(CONFIG_MEMCG_KMEM) && \
	(defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
	{
		.name = "kmem.slabinfo",
		.seq_show = mem_cgroup_slab_show,
	},
#endif
	{
		.name = "kmem.tcp.limit_in_bytes",
		.private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
		.write = mem_cgroup_write,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "kmem.tcp.usage_in_bytes",
		.private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "kmem.tcp.failcnt",
		.private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "kmem.tcp.max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{ },	/* terminate */
};

/*
 * Private memory cgroup IDR
 *
 * Swap-out records and page cache shadow entries need to store memcg
 * references in constrained space, so we maintain an ID space that is
 * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of
 * memory-controlled cgroups to 64k.
 *
 * However, there usually are many references to the offline CSS after
 * the cgroup has been destroyed, such as page cache or reclaimable
 * slab objects, that don't need to hang on to the ID.
We want to keep 518562306a36Sopenharmony_ci * those dead CSS from occupying IDs, or we might quickly exhaust the 518662306a36Sopenharmony_ci * relatively small ID space and prevent the creation of new cgroups 518762306a36Sopenharmony_ci * even when there are much fewer than 64k cgroups - possibly none. 518862306a36Sopenharmony_ci * 518962306a36Sopenharmony_ci * Maintain a private 16-bit ID space for memcg, and allow the ID to 519062306a36Sopenharmony_ci * be freed and recycled when it's no longer needed, which is usually 519162306a36Sopenharmony_ci * when the CSS is offlined. 519262306a36Sopenharmony_ci * 519362306a36Sopenharmony_ci * The only exception to that are records of swapped out tmpfs/shmem 519462306a36Sopenharmony_ci * pages that need to be attributed to live ancestors on swapin. But 519562306a36Sopenharmony_ci * those references are manageable from userspace. 519662306a36Sopenharmony_ci */ 519762306a36Sopenharmony_ci 519862306a36Sopenharmony_ci#define MEM_CGROUP_ID_MAX ((1UL << MEM_CGROUP_ID_SHIFT) - 1) 519962306a36Sopenharmony_cistatic DEFINE_IDR(mem_cgroup_idr); 520062306a36Sopenharmony_ci 520162306a36Sopenharmony_cistatic void mem_cgroup_id_remove(struct mem_cgroup *memcg) 520262306a36Sopenharmony_ci{ 520362306a36Sopenharmony_ci if (memcg->id.id > 0) { 520462306a36Sopenharmony_ci idr_remove(&mem_cgroup_idr, memcg->id.id); 520562306a36Sopenharmony_ci memcg->id.id = 0; 520662306a36Sopenharmony_ci } 520762306a36Sopenharmony_ci} 520862306a36Sopenharmony_ci 520962306a36Sopenharmony_cistatic void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg, 521062306a36Sopenharmony_ci unsigned int n) 521162306a36Sopenharmony_ci{ 521262306a36Sopenharmony_ci refcount_add(n, &memcg->id.ref); 521362306a36Sopenharmony_ci} 521462306a36Sopenharmony_ci 521562306a36Sopenharmony_cistatic void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) 521662306a36Sopenharmony_ci{ 521762306a36Sopenharmony_ci if (refcount_sub_and_test(n, &memcg->id.ref)) { 
521862306a36Sopenharmony_ci mem_cgroup_id_remove(memcg); 521962306a36Sopenharmony_ci 522062306a36Sopenharmony_ci /* Memcg ID pins CSS */ 522162306a36Sopenharmony_ci css_put(&memcg->css); 522262306a36Sopenharmony_ci } 522362306a36Sopenharmony_ci} 522462306a36Sopenharmony_ci 522562306a36Sopenharmony_cistatic inline void mem_cgroup_id_put(struct mem_cgroup *memcg) 522662306a36Sopenharmony_ci{ 522762306a36Sopenharmony_ci mem_cgroup_id_put_many(memcg, 1); 522862306a36Sopenharmony_ci} 522962306a36Sopenharmony_ci 523062306a36Sopenharmony_ci/** 523162306a36Sopenharmony_ci * mem_cgroup_from_id - look up a memcg from a memcg id 523262306a36Sopenharmony_ci * @id: the memcg id to look up 523362306a36Sopenharmony_ci * 523462306a36Sopenharmony_ci * Caller must hold rcu_read_lock(). 523562306a36Sopenharmony_ci */ 523662306a36Sopenharmony_cistruct mem_cgroup *mem_cgroup_from_id(unsigned short id) 523762306a36Sopenharmony_ci{ 523862306a36Sopenharmony_ci WARN_ON_ONCE(!rcu_read_lock_held()); 523962306a36Sopenharmony_ci#ifdef CONFIG_HYPERHOLD_FILE_LRU 524062306a36Sopenharmony_ci if (id == -1) 524162306a36Sopenharmony_ci return NULL; 524262306a36Sopenharmony_ci#endif 524362306a36Sopenharmony_ci return idr_find(&mem_cgroup_idr, id); 524462306a36Sopenharmony_ci} 524562306a36Sopenharmony_ci 524662306a36Sopenharmony_ci#ifdef CONFIG_SHRINKER_DEBUG 524762306a36Sopenharmony_cistruct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) 524862306a36Sopenharmony_ci{ 524962306a36Sopenharmony_ci struct cgroup *cgrp; 525062306a36Sopenharmony_ci struct cgroup_subsys_state *css; 525162306a36Sopenharmony_ci struct mem_cgroup *memcg; 525262306a36Sopenharmony_ci 525362306a36Sopenharmony_ci cgrp = cgroup_get_from_id(ino); 525462306a36Sopenharmony_ci if (IS_ERR(cgrp)) 525562306a36Sopenharmony_ci return ERR_CAST(cgrp); 525662306a36Sopenharmony_ci 525762306a36Sopenharmony_ci css = cgroup_get_e_css(cgrp, &memory_cgrp_subsys); 525862306a36Sopenharmony_ci if (css) 525962306a36Sopenharmony_ci memcg = 
container_of(css, struct mem_cgroup, css); 526062306a36Sopenharmony_ci else 526162306a36Sopenharmony_ci memcg = ERR_PTR(-ENOENT); 526262306a36Sopenharmony_ci 526362306a36Sopenharmony_ci cgroup_put(cgrp); 526462306a36Sopenharmony_ci 526562306a36Sopenharmony_ci return memcg; 526662306a36Sopenharmony_ci} 526762306a36Sopenharmony_ci#endif 526862306a36Sopenharmony_ci 526962306a36Sopenharmony_cistatic int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) 527062306a36Sopenharmony_ci{ 527162306a36Sopenharmony_ci struct mem_cgroup_per_node *pn; 527262306a36Sopenharmony_ci 527362306a36Sopenharmony_ci pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, node); 527462306a36Sopenharmony_ci if (!pn) 527562306a36Sopenharmony_ci return 1; 527662306a36Sopenharmony_ci 527762306a36Sopenharmony_ci pn->lruvec_stats_percpu = alloc_percpu_gfp(struct lruvec_stats_percpu, 527862306a36Sopenharmony_ci GFP_KERNEL_ACCOUNT); 527962306a36Sopenharmony_ci if (!pn->lruvec_stats_percpu) { 528062306a36Sopenharmony_ci kfree(pn); 528162306a36Sopenharmony_ci return 1; 528262306a36Sopenharmony_ci } 528362306a36Sopenharmony_ci 528462306a36Sopenharmony_ci lruvec_init(&pn->lruvec); 528562306a36Sopenharmony_ci#if defined(CONFIG_HYPERHOLD_FILE_LRU) && defined(CONFIG_MEMCG) 528662306a36Sopenharmony_ci pn->lruvec.pgdat = NODE_DATA(node); 528762306a36Sopenharmony_ci#endif 528862306a36Sopenharmony_ci pn->memcg = memcg; 528962306a36Sopenharmony_ci 529062306a36Sopenharmony_ci memcg->nodeinfo[node] = pn; 529162306a36Sopenharmony_ci return 0; 529262306a36Sopenharmony_ci} 529362306a36Sopenharmony_ci 529462306a36Sopenharmony_cistatic void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) 529562306a36Sopenharmony_ci{ 529662306a36Sopenharmony_ci struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; 529762306a36Sopenharmony_ci 529862306a36Sopenharmony_ci if (!pn) 529962306a36Sopenharmony_ci return; 530062306a36Sopenharmony_ci 530162306a36Sopenharmony_ci free_percpu(pn->lruvec_stats_percpu); 
530262306a36Sopenharmony_ci kfree(pn); 530362306a36Sopenharmony_ci} 530462306a36Sopenharmony_ci 530562306a36Sopenharmony_cistatic void __mem_cgroup_free(struct mem_cgroup *memcg) 530662306a36Sopenharmony_ci{ 530762306a36Sopenharmony_ci int node; 530862306a36Sopenharmony_ci 530962306a36Sopenharmony_ci for_each_node(node) 531062306a36Sopenharmony_ci free_mem_cgroup_per_node_info(memcg, node); 531162306a36Sopenharmony_ci kfree(memcg->vmstats); 531262306a36Sopenharmony_ci free_percpu(memcg->vmstats_percpu); 531362306a36Sopenharmony_ci kfree(memcg); 531462306a36Sopenharmony_ci} 531562306a36Sopenharmony_ci 531662306a36Sopenharmony_cistatic void mem_cgroup_free(struct mem_cgroup *memcg) 531762306a36Sopenharmony_ci{ 531862306a36Sopenharmony_ci lru_gen_exit_memcg(memcg); 531962306a36Sopenharmony_ci memcg_wb_domain_exit(memcg); 532062306a36Sopenharmony_ci __mem_cgroup_free(memcg); 532162306a36Sopenharmony_ci} 532262306a36Sopenharmony_ci 532362306a36Sopenharmony_cistatic struct mem_cgroup *mem_cgroup_alloc(void) 532462306a36Sopenharmony_ci{ 532562306a36Sopenharmony_ci struct mem_cgroup *memcg; 532662306a36Sopenharmony_ci int node; 532762306a36Sopenharmony_ci int __maybe_unused i; 532862306a36Sopenharmony_ci long error = -ENOMEM; 532962306a36Sopenharmony_ci 533062306a36Sopenharmony_ci memcg = kzalloc(struct_size(memcg, nodeinfo, nr_node_ids), GFP_KERNEL); 533162306a36Sopenharmony_ci if (!memcg) 533262306a36Sopenharmony_ci return ERR_PTR(error); 533362306a36Sopenharmony_ci 533462306a36Sopenharmony_ci memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL, 533562306a36Sopenharmony_ci 1, MEM_CGROUP_ID_MAX + 1, GFP_KERNEL); 533662306a36Sopenharmony_ci if (memcg->id.id < 0) { 533762306a36Sopenharmony_ci error = memcg->id.id; 533862306a36Sopenharmony_ci goto fail; 533962306a36Sopenharmony_ci } 534062306a36Sopenharmony_ci 534162306a36Sopenharmony_ci memcg->vmstats = kzalloc(sizeof(struct memcg_vmstats), GFP_KERNEL); 534262306a36Sopenharmony_ci if (!memcg->vmstats) 534362306a36Sopenharmony_ci 
goto fail; 534462306a36Sopenharmony_ci 534562306a36Sopenharmony_ci memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu, 534662306a36Sopenharmony_ci GFP_KERNEL_ACCOUNT); 534762306a36Sopenharmony_ci if (!memcg->vmstats_percpu) 534862306a36Sopenharmony_ci goto fail; 534962306a36Sopenharmony_ci 535062306a36Sopenharmony_ci for_each_node(node) 535162306a36Sopenharmony_ci if (alloc_mem_cgroup_per_node_info(memcg, node)) 535262306a36Sopenharmony_ci goto fail; 535362306a36Sopenharmony_ci 535462306a36Sopenharmony_ci if (memcg_wb_domain_init(memcg, GFP_KERNEL)) 535562306a36Sopenharmony_ci goto fail; 535662306a36Sopenharmony_ci 535762306a36Sopenharmony_ci INIT_WORK(&memcg->high_work, high_work_func); 535862306a36Sopenharmony_ci INIT_LIST_HEAD(&memcg->oom_notify); 535962306a36Sopenharmony_ci mutex_init(&memcg->thresholds_lock); 536062306a36Sopenharmony_ci spin_lock_init(&memcg->move_lock); 536162306a36Sopenharmony_ci vmpressure_init(&memcg->vmpressure); 536262306a36Sopenharmony_ci INIT_LIST_HEAD(&memcg->event_list); 536362306a36Sopenharmony_ci spin_lock_init(&memcg->event_list_lock); 536462306a36Sopenharmony_ci memcg->socket_pressure = jiffies; 536562306a36Sopenharmony_ci#ifdef CONFIG_MEMCG_KMEM 536662306a36Sopenharmony_ci memcg->kmemcg_id = -1; 536762306a36Sopenharmony_ci INIT_LIST_HEAD(&memcg->objcg_list); 536862306a36Sopenharmony_ci#endif 536962306a36Sopenharmony_ci#ifdef CONFIG_CGROUP_WRITEBACK 537062306a36Sopenharmony_ci INIT_LIST_HEAD(&memcg->cgwb_list); 537162306a36Sopenharmony_ci for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) 537262306a36Sopenharmony_ci memcg->cgwb_frn[i].done = 537362306a36Sopenharmony_ci __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq); 537462306a36Sopenharmony_ci#endif 537562306a36Sopenharmony_ci#ifdef CONFIG_TRANSPARENT_HUGEPAGE 537662306a36Sopenharmony_ci spin_lock_init(&memcg->deferred_split_queue.split_queue_lock); 537762306a36Sopenharmony_ci INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue); 537862306a36Sopenharmony_ci 
memcg->deferred_split_queue.split_queue_len = 0; 537962306a36Sopenharmony_ci#endif 538062306a36Sopenharmony_ci 538162306a36Sopenharmony_ci#ifdef CONFIG_HYPERHOLD_MEMCG 538262306a36Sopenharmony_ci if (unlikely(!score_head_inited)) { 538362306a36Sopenharmony_ci INIT_LIST_HEAD(&score_head); 538462306a36Sopenharmony_ci score_head_inited = true; 538562306a36Sopenharmony_ci } 538662306a36Sopenharmony_ci#endif 538762306a36Sopenharmony_ci 538862306a36Sopenharmony_ci#ifdef CONFIG_HYPERHOLD_MEMCG 538962306a36Sopenharmony_ci INIT_LIST_HEAD(&memcg->score_node); 539062306a36Sopenharmony_ci#endif 539162306a36Sopenharmony_ci 539262306a36Sopenharmony_ci lru_gen_init_memcg(memcg); 539362306a36Sopenharmony_ci return memcg; 539462306a36Sopenharmony_cifail: 539562306a36Sopenharmony_ci mem_cgroup_id_remove(memcg); 539662306a36Sopenharmony_ci __mem_cgroup_free(memcg); 539762306a36Sopenharmony_ci return ERR_PTR(error); 539862306a36Sopenharmony_ci} 539962306a36Sopenharmony_ci 540062306a36Sopenharmony_cistatic struct cgroup_subsys_state * __ref 540162306a36Sopenharmony_cimem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 540262306a36Sopenharmony_ci{ 540362306a36Sopenharmony_ci struct mem_cgroup *parent = mem_cgroup_from_css(parent_css); 540462306a36Sopenharmony_ci struct mem_cgroup *memcg, *old_memcg; 540562306a36Sopenharmony_ci 540662306a36Sopenharmony_ci old_memcg = set_active_memcg(parent); 540762306a36Sopenharmony_ci memcg = mem_cgroup_alloc(); 540862306a36Sopenharmony_ci set_active_memcg(old_memcg); 540962306a36Sopenharmony_ci if (IS_ERR(memcg)) 541062306a36Sopenharmony_ci return ERR_CAST(memcg); 541162306a36Sopenharmony_ci 541262306a36Sopenharmony_ci#ifdef CONFIG_HYPERHOLD_MEMCG 541362306a36Sopenharmony_ci atomic64_set(&memcg->memcg_reclaimed.app_score, 300); 541462306a36Sopenharmony_ci#endif 541562306a36Sopenharmony_ci#ifdef CONFIG_HYPERHOLD_ZSWAPD 541662306a36Sopenharmony_ci atomic_set(&memcg->memcg_reclaimed.ub_zram2ufs_ratio, 10); 541762306a36Sopenharmony_ci 
atomic_set(&memcg->memcg_reclaimed.ub_mem2zram_ratio, 60); 541862306a36Sopenharmony_ci atomic_set(&memcg->memcg_reclaimed.refault_threshold, 50); 541962306a36Sopenharmony_ci#endif 542062306a36Sopenharmony_ci page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); 542162306a36Sopenharmony_ci WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX); 542262306a36Sopenharmony_ci#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) 542362306a36Sopenharmony_ci memcg->zswap_max = PAGE_COUNTER_MAX; 542462306a36Sopenharmony_ci#endif 542562306a36Sopenharmony_ci page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); 542662306a36Sopenharmony_ci if (parent) { 542762306a36Sopenharmony_ci WRITE_ONCE(memcg->swappiness, mem_cgroup_swappiness(parent)); 542862306a36Sopenharmony_ci WRITE_ONCE(memcg->oom_kill_disable, READ_ONCE(parent->oom_kill_disable)); 542962306a36Sopenharmony_ci 543062306a36Sopenharmony_ci page_counter_init(&memcg->memory, &parent->memory); 543162306a36Sopenharmony_ci page_counter_init(&memcg->swap, &parent->swap); 543262306a36Sopenharmony_ci page_counter_init(&memcg->kmem, &parent->kmem); 543362306a36Sopenharmony_ci page_counter_init(&memcg->tcpmem, &parent->tcpmem); 543462306a36Sopenharmony_ci } else { 543562306a36Sopenharmony_ci init_memcg_events(); 543662306a36Sopenharmony_ci page_counter_init(&memcg->memory, NULL); 543762306a36Sopenharmony_ci page_counter_init(&memcg->swap, NULL); 543862306a36Sopenharmony_ci page_counter_init(&memcg->kmem, NULL); 543962306a36Sopenharmony_ci page_counter_init(&memcg->tcpmem, NULL); 544062306a36Sopenharmony_ci 544162306a36Sopenharmony_ci root_mem_cgroup = memcg; 544262306a36Sopenharmony_ci return &memcg->css; 544362306a36Sopenharmony_ci } 544462306a36Sopenharmony_ci 544562306a36Sopenharmony_ci if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) 544662306a36Sopenharmony_ci static_branch_inc(&memcg_sockets_enabled_key); 544762306a36Sopenharmony_ci 544862306a36Sopenharmony_ci#if defined(CONFIG_MEMCG_KMEM) 
544962306a36Sopenharmony_ci if (!cgroup_memory_nobpf) 545062306a36Sopenharmony_ci static_branch_inc(&memcg_bpf_enabled_key); 545162306a36Sopenharmony_ci#endif 545262306a36Sopenharmony_ci 545362306a36Sopenharmony_ci return &memcg->css; 545462306a36Sopenharmony_ci} 545562306a36Sopenharmony_ci 545662306a36Sopenharmony_cistatic int mem_cgroup_css_online(struct cgroup_subsys_state *css) 545762306a36Sopenharmony_ci{ 545862306a36Sopenharmony_ci struct mem_cgroup *memcg = mem_cgroup_from_css(css); 545962306a36Sopenharmony_ci 546062306a36Sopenharmony_ci if (memcg_online_kmem(memcg)) 546162306a36Sopenharmony_ci goto remove_id; 546262306a36Sopenharmony_ci 546362306a36Sopenharmony_ci /* 546462306a36Sopenharmony_ci * A memcg must be visible for expand_shrinker_info() 546562306a36Sopenharmony_ci * by the time the maps are allocated. So, we allocate maps 546662306a36Sopenharmony_ci * here, when for_each_mem_cgroup() can't skip it. 546762306a36Sopenharmony_ci */ 546862306a36Sopenharmony_ci if (alloc_shrinker_info(memcg)) 546962306a36Sopenharmony_ci goto offline_kmem; 547062306a36Sopenharmony_ci 547162306a36Sopenharmony_ci if (unlikely(mem_cgroup_is_root(memcg))) 547262306a36Sopenharmony_ci queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 547362306a36Sopenharmony_ci FLUSH_TIME); 547462306a36Sopenharmony_ci lru_gen_online_memcg(memcg); 547562306a36Sopenharmony_ci 547662306a36Sopenharmony_ci#ifdef CONFIG_HYPERHOLD_MEMCG 547762306a36Sopenharmony_ci memcg_app_score_update(memcg); 547862306a36Sopenharmony_ci css_get(css); 547962306a36Sopenharmony_ci#endif 548062306a36Sopenharmony_ci 548162306a36Sopenharmony_ci /* Online state pins memcg ID, memcg ID pins CSS */ 548262306a36Sopenharmony_ci refcount_set(&memcg->id.ref, 1); 548362306a36Sopenharmony_ci css_get(css); 548462306a36Sopenharmony_ci 548562306a36Sopenharmony_ci /* 548662306a36Sopenharmony_ci * Ensure mem_cgroup_from_id() works once we're fully online. 
548762306a36Sopenharmony_ci * 548862306a36Sopenharmony_ci * We could do this earlier and require callers to filter with 548962306a36Sopenharmony_ci * css_tryget_online(). But right now there are no users that 549062306a36Sopenharmony_ci * need earlier access, and the workingset code relies on the 549162306a36Sopenharmony_ci * cgroup tree linkage (mem_cgroup_get_nr_swap_pages()). So 549262306a36Sopenharmony_ci * publish it here at the end of onlining. This matches the 549362306a36Sopenharmony_ci * regular ID destruction during offlining. 549462306a36Sopenharmony_ci */ 549562306a36Sopenharmony_ci idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); 549662306a36Sopenharmony_ci 549762306a36Sopenharmony_ci return 0; 549862306a36Sopenharmony_cioffline_kmem: 549962306a36Sopenharmony_ci memcg_offline_kmem(memcg); 550062306a36Sopenharmony_ciremove_id: 550162306a36Sopenharmony_ci mem_cgroup_id_remove(memcg); 550262306a36Sopenharmony_ci return -ENOMEM; 550362306a36Sopenharmony_ci} 550462306a36Sopenharmony_ci 550562306a36Sopenharmony_cistatic void mem_cgroup_css_offline(struct cgroup_subsys_state *css) 550662306a36Sopenharmony_ci{ 550762306a36Sopenharmony_ci struct mem_cgroup *memcg = mem_cgroup_from_css(css); 550862306a36Sopenharmony_ci struct mem_cgroup_event *event, *tmp; 550962306a36Sopenharmony_ci 551062306a36Sopenharmony_ci#ifdef CONFIG_HYPERHOLD_MEMCG 551162306a36Sopenharmony_ci unsigned long flags; 551262306a36Sopenharmony_ci 551362306a36Sopenharmony_ci write_lock_irqsave(&score_list_lock, flags); 551462306a36Sopenharmony_ci list_del_init(&memcg->score_node); 551562306a36Sopenharmony_ci write_unlock_irqrestore(&score_list_lock, flags); 551662306a36Sopenharmony_ci css_put(css); 551762306a36Sopenharmony_ci#endif 551862306a36Sopenharmony_ci 551962306a36Sopenharmony_ci /* 552062306a36Sopenharmony_ci * Unregister events and notify userspace. 
	 * Notify userspace about cgroup removing only after rmdir of cgroup
	 * directory to avoid race between userspace and kernelspace.
	 */
	spin_lock_irq(&memcg->event_list_lock);
	list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
		list_del_init(&event->list);
		schedule_work(&event->remove);
	}
	spin_unlock_irq(&memcg->event_list_lock);

	/* Stop protection so reclaim can reparent/drain this group's pages. */
	page_counter_set_min(&memcg->memory, 0);
	page_counter_set_low(&memcg->memory, 0);

	memcg_offline_kmem(memcg);
	reparent_shrinker_deferred(memcg);
	wb_memcg_offline(memcg);
	lru_gen_offline_memcg(memcg);

	drain_all_stock(memcg);

	/* Drop the ID reference taken when the memcg went online. */
	mem_cgroup_id_put(memcg);
}

/*
 * Called once all css references are gone; invalidate cached iterator
 * positions that may still point at this memcg.
 */
static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	invalidate_reclaim_iterators(memcg);
	lru_gen_release_memcg(memcg);
}

/*
 * Final teardown of a memcg: wait for outstanding writeback FRN
 * completions, rewind the static branches bumped at allocation time,
 * and free all remaining per-memcg state.
 */
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
	int __maybe_unused i;

#ifdef CONFIG_CGROUP_WRITEBACK
	for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
		wb_wait_for_completion(&memcg->cgwb_frn[i].done);
#endif
	/*
	 * Two separate decrements mirror the two increment paths: the
	 * default-hierarchy socket accounting and the legacy tcpmem one.
	 */
	if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
		static_branch_dec(&memcg_sockets_enabled_key);

	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
		static_branch_dec(&memcg_sockets_enabled_key);

#if defined(CONFIG_MEMCG_KMEM)
	if (!cgroup_memory_nobpf)
		static_branch_dec(&memcg_bpf_enabled_key);
#endif

	vmpressure_cleanup(&memcg->vmpressure);
	cancel_work_sync(&memcg->high_work);
	mem_cgroup_remove_from_trees(memcg);
	free_shrinker_info(memcg);
	mem_cgroup_free(memcg);
}

/**
 * mem_cgroup_css_reset - reset the states of a mem_cgroup
 * @css: the target css
 *
 * Reset the states of the mem_cgroup associated with @css.  This is
 * invoked when the userland requests disabling on the default hierarchy
 * but the memcg is pinned through dependency.  The memcg should stop
 * applying policies and should revert to the vanilla state as it may be
 * made visible again.
 *
 * The current implementation only resets the essential configurations.
 * This needs to be expanded to cover all the visible parts.
 */
static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	/* Lift all limits and clear all protection back to defaults. */
	page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
	page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
	page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
	page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
	page_counter_set_min(&memcg->memory, 0);
	page_counter_set_low(&memcg->memory, 0);
	page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
	WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX);
	page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
	memcg_wb_domain_size_changed(memcg);
}

/*
 * rstat flush callback for one (memcg, cpu) pair.  Folds this CPU's
 * per-cpu stat/event deltas into the memcg's local and hierarchical
 * counters, and forwards hierarchical deltas to the parent via the
 * *_pending arrays.  The same pattern is applied three times: memcg
 * state, memcg events, and per-node lruvec state.
 */
static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
	struct memcg_vmstats_percpu *statc;
	long delta, delta_cpu, v;
	int i, nid;

	statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);

	for (i = 0; i < MEMCG_NR_STAT; i++) {
		/*
		 * Collect the aggregated propagation counts of groups
		 * below us. We're in a per-cpu loop here and this is
		 * a global counter, so the first cycle will get them.
		 */
		delta = memcg->vmstats->state_pending[i];
		if (delta)
			memcg->vmstats->state_pending[i] = 0;

		/* Add CPU changes on this level since the last flush */
		delta_cpu = 0;
		v = READ_ONCE(statc->state[i]);
		if (v != statc->state_prev[i]) {
			delta_cpu = v - statc->state_prev[i];
			delta += delta_cpu;
			statc->state_prev[i] = v;
		}

		/* Aggregate counts on this level and propagate upwards */
		if (delta_cpu)
			memcg->vmstats->state_local[i] += delta_cpu;

		if (delta) {
			memcg->vmstats->state[i] += delta;
			if (parent)
				parent->vmstats->state_pending[i] += delta;
		}
	}

	/* Same aggregation scheme for the vm event counters. */
	for (i = 0; i < NR_MEMCG_EVENTS; i++) {
		delta = memcg->vmstats->events_pending[i];
		if (delta)
			memcg->vmstats->events_pending[i] = 0;

		delta_cpu = 0;
		v = READ_ONCE(statc->events[i]);
		if (v != statc->events_prev[i]) {
			delta_cpu = v - statc->events_prev[i];
			delta += delta_cpu;
			statc->events_prev[i] = v;
		}

		if (delta_cpu)
			memcg->vmstats->events_local[i] += delta_cpu;

		if (delta) {
			memcg->vmstats->events[i] += delta;
			if (parent)
				parent->vmstats->events_pending[i] += delta;
		}
	}

	/* And once more per node for the lruvec (per-node) statistics. */
	for_each_node_state(nid, N_MEMORY) {
		struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
		struct mem_cgroup_per_node *ppn = NULL;
		struct lruvec_stats_percpu *lstatc;

		if (parent)
			ppn = parent->nodeinfo[nid];

		lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu);

		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
			delta = pn->lruvec_stats.state_pending[i];
			if (delta)
				pn->lruvec_stats.state_pending[i] = 0;

			delta_cpu = 0;
			v = READ_ONCE(lstatc->state[i]);
			if (v != lstatc->state_prev[i]) {
				delta_cpu = v - lstatc->state_prev[i];
				delta += delta_cpu;
				lstatc->state_prev[i] = v;
			}

			if (delta_cpu)
				pn->lruvec_stats.state_local[i] += delta_cpu;

			if (delta) {
				pn->lruvec_stats.state[i] += delta;
				if (ppn)
					ppn->lruvec_stats.state_pending[i] += delta;
			}
		}
	}
}

#ifdef CONFIG_MMU
/* Handlers for move charge at task migration. */

/*
 * Pre-charge @count pages to the destination memcg (mc.to), first as one
 * bulk charge without reclaim, then page by page with reclaim but without
 * retries.  Successfully charged pages accumulate in mc.precharge.
 *
 * Return: 0 on full success, or the try_charge() error for the first
 * page that could not be charged.
 */
static int mem_cgroup_do_precharge(unsigned long count)
{
	int ret;

	/* Try a single bulk charge without reclaim first, kswapd may wake */
	ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
	if (!ret) {
		mc.precharge += count;
		return ret;
	}

	/* Try charges one by one with reclaim, but do not retry */
	while (count--) {
		ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1);
		if (ret)
			return ret;
		mc.precharge++;
		cond_resched();
	}
	return 0;
}

/* Target of a charge move: either a page or a swap entry, never both. */
union mc_target {
	struct page	*page;
	swp_entry_t	ent;
};

/* What kind of move-charge target a pte/pmd resolved to. */
enum mc_target_type {
	MC_TARGET_NONE = 0,
	MC_TARGET_PAGE,
	MC_TARGET_SWAP,
	MC_TARGET_DEVICE,
};

static struct page
*mc_handle_present_pte(struct vm_area_struct *vma,
						unsigned long addr, pte_t ptent)
{
	struct page *page = vm_normal_page(vma, addr, ptent);

	if (!page)
		return NULL;
	/* Respect the configured move flags: anon vs file pages. */
	if (PageAnon(page)) {
		if (!(mc.flags & MOVE_ANON))
			return NULL;
	} else {
		if (!(mc.flags & MOVE_FILE))
			return NULL;
	}
	/* Caller owns this reference and must put_page() it. */
	get_page(page);

	return page;
}

#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
/*
 * Resolve a swap pte for move-charge.  For device-private entries the
 * backing page itself is returned; for real swap entries the swap-cache
 * page (if any) is returned and *entry is filled in either way.
 */
static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
			pte_t ptent, swp_entry_t *entry)
{
	struct page *page = NULL;
	swp_entry_t ent = pte_to_swp_entry(ptent);

	if (!(mc.flags & MOVE_ANON))
		return NULL;

	/*
	 * Handle device private pages that are not accessible by the CPU, but
	 * stored as special swap entries in the page table.
	 */
	if (is_device_private_entry(ent)) {
		page = pfn_swap_entry_to_page(ent);
		if (!get_page_unless_zero(page))
			return NULL;
		return page;
	}

	if (non_swap_entry(ent))
		return NULL;

	/*
	 * Because swap_cache_get_folio() updates some statistics counter,
	 * we call find_get_page() with swapper_space directly.
	 */
	page = find_get_page(swap_address_space(ent), swp_offset(ent));
	entry->val = ent.val;

	return page;
}
#else
/* !CONFIG_SWAP && !CONFIG_DEVICE_PRIVATE: no swap ptes to handle. */
static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
			pte_t ptent, swp_entry_t *entry)
{
	return NULL;
}
#endif

/*
 * Resolve a file-backed (or shmem-swapped) mapping slot for move-charge
 * by looking up the page cache at the pte's file offset.
 */
static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
			unsigned long addr, pte_t ptent)
{
	unsigned long index;
	struct folio *folio;

	if (!vma->vm_file) /* anonymous vma */
		return NULL;
	if (!(mc.flags & MOVE_FILE))
		return NULL;

	/* folio is moved even if it's not RSS of this task(page-faulted).
 */
	/* shmem/tmpfs may report page out on swap: account for that too. */
	index = linear_page_index(vma, addr);
	folio = filemap_get_incore_folio(vma->vm_file->f_mapping, index);
	if (IS_ERR(folio))
		return NULL;
	return folio_file_page(folio, index);
}

/**
 * mem_cgroup_move_account - move account of the page
 * @page: the page
 * @compound: charge the page as compound or small page
 * @from: mem_cgroup which the page is moved from.
 * @to:	mem_cgroup which the page is moved to. @from != @to.
 *
 * The page must be locked and not on the LRU.
 *
 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
 * from old cgroup.
 *
 * Return: 0 on success, -EINVAL if the folio is not charged to @from.
 */
static int mem_cgroup_move_account(struct page *page,
				   bool compound,
				   struct mem_cgroup *from,
				   struct mem_cgroup *to)
{
	struct folio *folio = page_folio(page);
	struct lruvec *from_vec, *to_vec;
	struct pglist_data *pgdat;
	unsigned int nr_pages = compound ? folio_nr_pages(folio) : 1;
	int nid, ret;

	VM_BUG_ON(from == to);
	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
	VM_BUG_ON(compound && !folio_test_large(folio));

	ret = -EINVAL;
	if (folio_memcg(folio) != from)
		goto out;

	pgdat = folio_pgdat(folio);
	from_vec = mem_cgroup_lruvec(from, pgdat);
	to_vec = mem_cgroup_lruvec(to, pgdat);

	folio_memcg_lock(folio);

	/*
	 * Transfer every lruvec counter the folio contributes to from
	 * @from's lruvec to @to's: anon/file mapped, shmem, dirty,
	 * swapcache and writeback state.
	 */
	if (folio_test_anon(folio)) {
		if (folio_mapped(folio)) {
			__mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages);
			__mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages);
			if (folio_test_pmd_mappable(folio)) {
				__mod_lruvec_state(from_vec, NR_ANON_THPS,
						   -nr_pages);
				__mod_lruvec_state(to_vec, NR_ANON_THPS,
						   nr_pages);
			}
		}
	} else {
		__mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
		__mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages);

		if (folio_test_swapbacked(folio)) {
			__mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages);
			__mod_lruvec_state(to_vec, NR_SHMEM, nr_pages);
		}

		if (folio_mapped(folio)) {
			__mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
			__mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
		}

		if (folio_test_dirty(folio)) {
			struct address_space *mapping = folio_mapping(folio);

			if (mapping_can_writeback(mapping)) {
				__mod_lruvec_state(from_vec, NR_FILE_DIRTY,
						   -nr_pages);
				__mod_lruvec_state(to_vec, NR_FILE_DIRTY,
						   nr_pages);
			}
		}
	}

#ifdef CONFIG_SWAP
	if (folio_test_swapcache(folio)) {
		__mod_lruvec_state(from_vec, NR_SWAPCACHE, -nr_pages);
		__mod_lruvec_state(to_vec, NR_SWAPCACHE, nr_pages);
	}
#endif
	if (folio_test_writeback(folio)) {
		__mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
		__mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
	}

	/*
	 * All state has been migrated, let's switch to the new memcg.
	 *
	 * It is safe to change page's memcg here because the page
	 * is referenced, charged, isolated, and locked: we can't race
	 * with (un)charging, migration, LRU putback, or anything else
	 * that would rely on a stable page's memory cgroup.
	 *
	 * Note that folio_memcg_lock is a memcg lock, not a page lock,
	 * to save space. As soon as we switch page's memory cgroup to a
	 * new memcg that isn't locked, the above state can change
	 * concurrently again. Make sure we're truly done with it.
	 */
	smp_mb();

	css_get(&to->css);
	css_put(&from->css);

	folio->memcg_data = (unsigned long)to;

	__folio_memcg_unlock(from);

	ret = 0;
	nid = folio_nid(folio);

	local_irq_disable();
	mem_cgroup_charge_statistics(to, nr_pages);
	memcg_check_events(to, nid);
	mem_cgroup_charge_statistics(from, -nr_pages);
	memcg_check_events(from, nid);
	local_irq_enable();
out:
	return ret;
}

/**
 * get_mctgt_type - get target type of moving charge
 * @vma: the vma the pte to be checked belongs
 * @addr: the address corresponding to the pte to be checked
 * @ptent: the pte to be checked
 * @target: the pointer the target page or swap ent will be stored(can be NULL)
 *
 * Context: Called with pte lock held.
 * Return:
 * * MC_TARGET_NONE - If the pte is not a target for move charge.
 * * MC_TARGET_PAGE - If the page corresponding to this pte is a target for
 *   move charge. If @target is not NULL, the page is stored in target->page
 *   with extra refcnt taken (Caller should release it).
 * * MC_TARGET_SWAP - If the swap entry corresponding to this pte is a
 *   target for charge migration. If @target is not NULL, the entry is
 *   stored in target->ent.
 * * MC_TARGET_DEVICE - Like MC_TARGET_PAGE but page is device memory and
 *   thus not on the lru. For now such page is charged like a regular page
 *   would be as it is just special memory taking the place of a regular page.
 *   See Documentation/vm/hmm.txt and include/linux/hmm.h
 */
static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
		unsigned long addr, pte_t ptent, union mc_target *target)
{
	struct page *page = NULL;
	enum mc_target_type ret = MC_TARGET_NONE;
	swp_entry_t ent = { .val = 0 };

	/* Dispatch on the pte flavor: present, none/marker, or swap. */
	if (pte_present(ptent))
		page = mc_handle_present_pte(vma, addr, ptent);
	else if (pte_none_mostly(ptent))
		/*
		 * PTE markers should be treated as a none pte here, separated
		 * from other swap handling below.
		 */
		page = mc_handle_file_pte(vma, addr, ptent);
	else if (is_swap_pte(ptent))
		page = mc_handle_swap_pte(vma, ptent, &ent);

	if (target && page) {
		if (!trylock_page(page)) {
			put_page(page);
			return ret;
		}
		/*
		 * page_mapped() must be stable during the move. This
		 * pte is locked, so if it's present, the page cannot
		 * become unmapped. If it isn't, we have only partial
		 * control over the mapped state: the page lock will
		 * prevent new faults against pagecache and swapcache,
		 * so an unmapped page cannot become mapped. However,
		 * if the page is already mapped elsewhere, it can
		 * unmap, and there is nothing we can do about it.
		 * Alas, skip moving the page in this case.
		 */
		if (!pte_present(ptent) && page_mapped(page)) {
			unlock_page(page);
			put_page(page);
			return ret;
		}
	}

	if (!page && !ent.val)
		return ret;
	if (page) {
		/*
		 * Do only loose check w/o serialization.
		 * mem_cgroup_move_account() checks the page is valid or
		 * not under LRU exclusion.
		 */
		if (page_memcg(page) == mc.from) {
			ret = MC_TARGET_PAGE;
			if (is_device_private_page(page) ||
			    is_device_coherent_page(page))
				ret = MC_TARGET_DEVICE;
			if (target)
				target->page = page;
		}
		/* Not a target (or no @target): release lock/ref again. */
		if (!ret || !target) {
			if (target)
				unlock_page(page);
			put_page(page);
		}
	}
	/*
	 * There is a swap entry and a page doesn't exist or isn't charged.
	 * But we cannot move a tail-page in a THP.
	 */
	if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
	    mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
		ret = MC_TARGET_SWAP;
		if (target)
			target->ent = ent;
	}
	return ret;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * We don't consider PMD mapped swapping or file mapped pages because THP does
 * not support them for now.
 * Caller should make sure that pmd_trans_huge(pmd) is true.
 */
static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
		unsigned long addr, pmd_t pmd, union mc_target *target)
{
	struct page *page = NULL;
	enum mc_target_type ret = MC_TARGET_NONE;

	/* PMD migration entries are not move-charge targets. */
	if (unlikely(is_swap_pmd(pmd))) {
		VM_BUG_ON(thp_migration_supported() &&
				  !is_pmd_migration_entry(pmd));
		return ret;
	}
	page = pmd_page(pmd);
	VM_BUG_ON_PAGE(!page || !PageHead(page), page);
	if (!(mc.flags & MOVE_ANON))
		return ret;
	if (page_memcg(page) == mc.from) {
		ret = MC_TARGET_PAGE;
		if (target) {
			/* Hand back a referenced, locked page — or nothing. */
			get_page(page);
			if (!trylock_page(page)) {
				put_page(page);
				return MC_TARGET_NONE;
			}
			target->page = page;
		}
	}
	return ret;
}
#else
/* !CONFIG_TRANSPARENT_HUGEPAGE: there are no huge-pmd targets. */
static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
		unsigned long addr, pmd_t pmd, union mc_target *target)
{
	return MC_TARGET_NONE;
}
#endif

/*
 * Page-walk callback for the precharge pass: count how many ptes/pmds in
 * [addr, end) would be move-charge targets, accumulating into mc.precharge.
 */
static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
					unsigned long addr, unsigned long end,
					struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	pte_t *pte;
	spinlock_t *ptl;

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		/*
		 * Note there can not be MC_TARGET_DEVICE for now as we do not
		 * support transparent huge page with MEMORY_DEVICE_PRIVATE but
		 * this might change.
		 */
		if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
			mc.precharge += HPAGE_PMD_NR;
		spin_unlock(ptl);
		return 0;
	}

	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	if (!pte)
		return 0;
	for (; addr != end; pte++, addr += PAGE_SIZE)
		if (get_mctgt_type(vma, addr, ptep_get(pte), NULL))
			mc.precharge++;	/* increment precharge temporarily */
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();

	return 0;
}

static const struct mm_walk_ops precharge_walk_ops = {
	.pmd_entry	= mem_cgroup_count_precharge_pte_range,
	.walk_lock	= PGWALK_RDLOCK,
};

/*
 * Walk the whole address space of @mm and return the number of pages
 * that would be moved, consuming (and resetting) mc.precharge.
 */
static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
{
612262306a36Sopenharmony_ci unsigned long precharge; 612362306a36Sopenharmony_ci 612462306a36Sopenharmony_ci mmap_read_lock(mm); 612562306a36Sopenharmony_ci walk_page_range(mm, 0, ULONG_MAX, &precharge_walk_ops, NULL); 612662306a36Sopenharmony_ci mmap_read_unlock(mm); 612762306a36Sopenharmony_ci 612862306a36Sopenharmony_ci precharge = mc.precharge; 612962306a36Sopenharmony_ci mc.precharge = 0; 613062306a36Sopenharmony_ci 613162306a36Sopenharmony_ci return precharge; 613262306a36Sopenharmony_ci} 613362306a36Sopenharmony_ci 613462306a36Sopenharmony_cistatic int mem_cgroup_precharge_mc(struct mm_struct *mm) 613562306a36Sopenharmony_ci{ 613662306a36Sopenharmony_ci unsigned long precharge = mem_cgroup_count_precharge(mm); 613762306a36Sopenharmony_ci 613862306a36Sopenharmony_ci VM_BUG_ON(mc.moving_task); 613962306a36Sopenharmony_ci mc.moving_task = current; 614062306a36Sopenharmony_ci return mem_cgroup_do_precharge(precharge); 614162306a36Sopenharmony_ci} 614262306a36Sopenharmony_ci 614362306a36Sopenharmony_ci/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ 614462306a36Sopenharmony_cistatic void __mem_cgroup_clear_mc(void) 614562306a36Sopenharmony_ci{ 614662306a36Sopenharmony_ci struct mem_cgroup *from = mc.from; 614762306a36Sopenharmony_ci struct mem_cgroup *to = mc.to; 614862306a36Sopenharmony_ci 614962306a36Sopenharmony_ci /* we must uncharge all the leftover precharges from mc.to */ 615062306a36Sopenharmony_ci if (mc.precharge) { 615162306a36Sopenharmony_ci cancel_charge(mc.to, mc.precharge); 615262306a36Sopenharmony_ci mc.precharge = 0; 615362306a36Sopenharmony_ci } 615462306a36Sopenharmony_ci /* 615562306a36Sopenharmony_ci * we didn't uncharge from mc.from at mem_cgroup_move_account(), so 615662306a36Sopenharmony_ci * we must uncharge here. 
615762306a36Sopenharmony_ci */ 615862306a36Sopenharmony_ci if (mc.moved_charge) { 615962306a36Sopenharmony_ci cancel_charge(mc.from, mc.moved_charge); 616062306a36Sopenharmony_ci mc.moved_charge = 0; 616162306a36Sopenharmony_ci } 616262306a36Sopenharmony_ci /* we must fixup refcnts and charges */ 616362306a36Sopenharmony_ci if (mc.moved_swap) { 616462306a36Sopenharmony_ci /* uncharge swap account from the old cgroup */ 616562306a36Sopenharmony_ci if (!mem_cgroup_is_root(mc.from)) 616662306a36Sopenharmony_ci page_counter_uncharge(&mc.from->memsw, mc.moved_swap); 616762306a36Sopenharmony_ci 616862306a36Sopenharmony_ci mem_cgroup_id_put_many(mc.from, mc.moved_swap); 616962306a36Sopenharmony_ci 617062306a36Sopenharmony_ci /* 617162306a36Sopenharmony_ci * we charged both to->memory and to->memsw, so we 617262306a36Sopenharmony_ci * should uncharge to->memory. 617362306a36Sopenharmony_ci */ 617462306a36Sopenharmony_ci if (!mem_cgroup_is_root(mc.to)) 617562306a36Sopenharmony_ci page_counter_uncharge(&mc.to->memory, mc.moved_swap); 617662306a36Sopenharmony_ci 617762306a36Sopenharmony_ci mc.moved_swap = 0; 617862306a36Sopenharmony_ci } 617962306a36Sopenharmony_ci memcg_oom_recover(from); 618062306a36Sopenharmony_ci memcg_oom_recover(to); 618162306a36Sopenharmony_ci wake_up_all(&mc.waitq); 618262306a36Sopenharmony_ci} 618362306a36Sopenharmony_ci 618462306a36Sopenharmony_cistatic void mem_cgroup_clear_mc(void) 618562306a36Sopenharmony_ci{ 618662306a36Sopenharmony_ci struct mm_struct *mm = mc.mm; 618762306a36Sopenharmony_ci 618862306a36Sopenharmony_ci /* 618962306a36Sopenharmony_ci * we must clear moving_task before waking up waiters at the end of 619062306a36Sopenharmony_ci * task migration. 
619162306a36Sopenharmony_ci */ 619262306a36Sopenharmony_ci mc.moving_task = NULL; 619362306a36Sopenharmony_ci __mem_cgroup_clear_mc(); 619462306a36Sopenharmony_ci spin_lock(&mc.lock); 619562306a36Sopenharmony_ci mc.from = NULL; 619662306a36Sopenharmony_ci mc.to = NULL; 619762306a36Sopenharmony_ci mc.mm = NULL; 619862306a36Sopenharmony_ci spin_unlock(&mc.lock); 619962306a36Sopenharmony_ci 620062306a36Sopenharmony_ci mmput(mm); 620162306a36Sopenharmony_ci} 620262306a36Sopenharmony_ci 620362306a36Sopenharmony_cistatic int mem_cgroup_can_attach(struct cgroup_taskset *tset) 620462306a36Sopenharmony_ci{ 620562306a36Sopenharmony_ci struct cgroup_subsys_state *css; 620662306a36Sopenharmony_ci struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */ 620762306a36Sopenharmony_ci struct mem_cgroup *from; 620862306a36Sopenharmony_ci struct task_struct *leader, *p; 620962306a36Sopenharmony_ci struct mm_struct *mm; 621062306a36Sopenharmony_ci unsigned long move_flags; 621162306a36Sopenharmony_ci int ret = 0; 621262306a36Sopenharmony_ci 621362306a36Sopenharmony_ci /* charge immigration isn't supported on the default hierarchy */ 621462306a36Sopenharmony_ci if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 621562306a36Sopenharmony_ci return 0; 621662306a36Sopenharmony_ci 621762306a36Sopenharmony_ci /* 621862306a36Sopenharmony_ci * Multi-process migrations only happen on the default hierarchy 621962306a36Sopenharmony_ci * where charge immigration is not used. Perform charge 622062306a36Sopenharmony_ci * immigration if @tset contains a leader and whine if there are 622162306a36Sopenharmony_ci * multiple. 
622262306a36Sopenharmony_ci */ 622362306a36Sopenharmony_ci p = NULL; 622462306a36Sopenharmony_ci cgroup_taskset_for_each_leader(leader, css, tset) { 622562306a36Sopenharmony_ci WARN_ON_ONCE(p); 622662306a36Sopenharmony_ci p = leader; 622762306a36Sopenharmony_ci memcg = mem_cgroup_from_css(css); 622862306a36Sopenharmony_ci } 622962306a36Sopenharmony_ci if (!p) 623062306a36Sopenharmony_ci return 0; 623162306a36Sopenharmony_ci 623262306a36Sopenharmony_ci /* 623362306a36Sopenharmony_ci * We are now committed to this value whatever it is. Changes in this 623462306a36Sopenharmony_ci * tunable will only affect upcoming migrations, not the current one. 623562306a36Sopenharmony_ci * So we need to save it, and keep it going. 623662306a36Sopenharmony_ci */ 623762306a36Sopenharmony_ci move_flags = READ_ONCE(memcg->move_charge_at_immigrate); 623862306a36Sopenharmony_ci if (!move_flags) 623962306a36Sopenharmony_ci return 0; 624062306a36Sopenharmony_ci 624162306a36Sopenharmony_ci from = mem_cgroup_from_task(p); 624262306a36Sopenharmony_ci 624362306a36Sopenharmony_ci VM_BUG_ON(from == memcg); 624462306a36Sopenharmony_ci 624562306a36Sopenharmony_ci mm = get_task_mm(p); 624662306a36Sopenharmony_ci if (!mm) 624762306a36Sopenharmony_ci return 0; 624862306a36Sopenharmony_ci /* We move charges only when we move a owner of the mm */ 624962306a36Sopenharmony_ci if (mm->owner == p) { 625062306a36Sopenharmony_ci VM_BUG_ON(mc.from); 625162306a36Sopenharmony_ci VM_BUG_ON(mc.to); 625262306a36Sopenharmony_ci VM_BUG_ON(mc.precharge); 625362306a36Sopenharmony_ci VM_BUG_ON(mc.moved_charge); 625462306a36Sopenharmony_ci VM_BUG_ON(mc.moved_swap); 625562306a36Sopenharmony_ci 625662306a36Sopenharmony_ci spin_lock(&mc.lock); 625762306a36Sopenharmony_ci mc.mm = mm; 625862306a36Sopenharmony_ci mc.from = from; 625962306a36Sopenharmony_ci mc.to = memcg; 626062306a36Sopenharmony_ci mc.flags = move_flags; 626162306a36Sopenharmony_ci spin_unlock(&mc.lock); 626262306a36Sopenharmony_ci /* We set mc.moving_task 
later */ 626362306a36Sopenharmony_ci 626462306a36Sopenharmony_ci ret = mem_cgroup_precharge_mc(mm); 626562306a36Sopenharmony_ci if (ret) 626662306a36Sopenharmony_ci mem_cgroup_clear_mc(); 626762306a36Sopenharmony_ci } else { 626862306a36Sopenharmony_ci mmput(mm); 626962306a36Sopenharmony_ci } 627062306a36Sopenharmony_ci return ret; 627162306a36Sopenharmony_ci} 627262306a36Sopenharmony_ci 627362306a36Sopenharmony_cistatic void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) 627462306a36Sopenharmony_ci{ 627562306a36Sopenharmony_ci if (mc.to) 627662306a36Sopenharmony_ci mem_cgroup_clear_mc(); 627762306a36Sopenharmony_ci} 627862306a36Sopenharmony_ci 627962306a36Sopenharmony_cistatic int mem_cgroup_move_charge_pte_range(pmd_t *pmd, 628062306a36Sopenharmony_ci unsigned long addr, unsigned long end, 628162306a36Sopenharmony_ci struct mm_walk *walk) 628262306a36Sopenharmony_ci{ 628362306a36Sopenharmony_ci int ret = 0; 628462306a36Sopenharmony_ci struct vm_area_struct *vma = walk->vma; 628562306a36Sopenharmony_ci pte_t *pte; 628662306a36Sopenharmony_ci spinlock_t *ptl; 628762306a36Sopenharmony_ci enum mc_target_type target_type; 628862306a36Sopenharmony_ci union mc_target target; 628962306a36Sopenharmony_ci struct page *page; 629062306a36Sopenharmony_ci 629162306a36Sopenharmony_ci ptl = pmd_trans_huge_lock(pmd, vma); 629262306a36Sopenharmony_ci if (ptl) { 629362306a36Sopenharmony_ci if (mc.precharge < HPAGE_PMD_NR) { 629462306a36Sopenharmony_ci spin_unlock(ptl); 629562306a36Sopenharmony_ci return 0; 629662306a36Sopenharmony_ci } 629762306a36Sopenharmony_ci target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); 629862306a36Sopenharmony_ci if (target_type == MC_TARGET_PAGE) { 629962306a36Sopenharmony_ci page = target.page; 630062306a36Sopenharmony_ci if (isolate_lru_page(page)) { 630162306a36Sopenharmony_ci if (!mem_cgroup_move_account(page, true, 630262306a36Sopenharmony_ci mc.from, mc.to)) { 630362306a36Sopenharmony_ci mc.precharge -= HPAGE_PMD_NR; 
630462306a36Sopenharmony_ci mc.moved_charge += HPAGE_PMD_NR; 630562306a36Sopenharmony_ci } 630662306a36Sopenharmony_ci putback_lru_page(page); 630762306a36Sopenharmony_ci } 630862306a36Sopenharmony_ci unlock_page(page); 630962306a36Sopenharmony_ci put_page(page); 631062306a36Sopenharmony_ci } else if (target_type == MC_TARGET_DEVICE) { 631162306a36Sopenharmony_ci page = target.page; 631262306a36Sopenharmony_ci if (!mem_cgroup_move_account(page, true, 631362306a36Sopenharmony_ci mc.from, mc.to)) { 631462306a36Sopenharmony_ci mc.precharge -= HPAGE_PMD_NR; 631562306a36Sopenharmony_ci mc.moved_charge += HPAGE_PMD_NR; 631662306a36Sopenharmony_ci } 631762306a36Sopenharmony_ci unlock_page(page); 631862306a36Sopenharmony_ci put_page(page); 631962306a36Sopenharmony_ci } 632062306a36Sopenharmony_ci spin_unlock(ptl); 632162306a36Sopenharmony_ci return 0; 632262306a36Sopenharmony_ci } 632362306a36Sopenharmony_ci 632462306a36Sopenharmony_ciretry: 632562306a36Sopenharmony_ci pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 632662306a36Sopenharmony_ci if (!pte) 632762306a36Sopenharmony_ci return 0; 632862306a36Sopenharmony_ci for (; addr != end; addr += PAGE_SIZE) { 632962306a36Sopenharmony_ci pte_t ptent = ptep_get(pte++); 633062306a36Sopenharmony_ci bool device = false; 633162306a36Sopenharmony_ci swp_entry_t ent; 633262306a36Sopenharmony_ci 633362306a36Sopenharmony_ci if (!mc.precharge) 633462306a36Sopenharmony_ci break; 633562306a36Sopenharmony_ci 633662306a36Sopenharmony_ci switch (get_mctgt_type(vma, addr, ptent, &target)) { 633762306a36Sopenharmony_ci case MC_TARGET_DEVICE: 633862306a36Sopenharmony_ci device = true; 633962306a36Sopenharmony_ci fallthrough; 634062306a36Sopenharmony_ci case MC_TARGET_PAGE: 634162306a36Sopenharmony_ci page = target.page; 634262306a36Sopenharmony_ci /* 634362306a36Sopenharmony_ci * We can have a part of the split pmd here. 
Moving it 634462306a36Sopenharmony_ci * can be done but it would be too convoluted so simply 634562306a36Sopenharmony_ci * ignore such a partial THP and keep it in original 634662306a36Sopenharmony_ci * memcg. There should be somebody mapping the head. 634762306a36Sopenharmony_ci */ 634862306a36Sopenharmony_ci if (PageTransCompound(page)) 634962306a36Sopenharmony_ci goto put; 635062306a36Sopenharmony_ci if (!device && !isolate_lru_page(page)) 635162306a36Sopenharmony_ci goto put; 635262306a36Sopenharmony_ci if (!mem_cgroup_move_account(page, false, 635362306a36Sopenharmony_ci mc.from, mc.to)) { 635462306a36Sopenharmony_ci mc.precharge--; 635562306a36Sopenharmony_ci /* we uncharge from mc.from later. */ 635662306a36Sopenharmony_ci mc.moved_charge++; 635762306a36Sopenharmony_ci } 635862306a36Sopenharmony_ci if (!device) 635962306a36Sopenharmony_ci putback_lru_page(page); 636062306a36Sopenharmony_ciput: /* get_mctgt_type() gets & locks the page */ 636162306a36Sopenharmony_ci unlock_page(page); 636262306a36Sopenharmony_ci put_page(page); 636362306a36Sopenharmony_ci break; 636462306a36Sopenharmony_ci case MC_TARGET_SWAP: 636562306a36Sopenharmony_ci ent = target.ent; 636662306a36Sopenharmony_ci if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) { 636762306a36Sopenharmony_ci mc.precharge--; 636862306a36Sopenharmony_ci mem_cgroup_id_get_many(mc.to, 1); 636962306a36Sopenharmony_ci /* we fixup other refcnts and charges later. */ 637062306a36Sopenharmony_ci mc.moved_swap++; 637162306a36Sopenharmony_ci } 637262306a36Sopenharmony_ci break; 637362306a36Sopenharmony_ci default: 637462306a36Sopenharmony_ci break; 637562306a36Sopenharmony_ci } 637662306a36Sopenharmony_ci } 637762306a36Sopenharmony_ci pte_unmap_unlock(pte - 1, ptl); 637862306a36Sopenharmony_ci cond_resched(); 637962306a36Sopenharmony_ci 638062306a36Sopenharmony_ci if (addr != end) { 638162306a36Sopenharmony_ci /* 638262306a36Sopenharmony_ci * We have consumed all precharges we got in can_attach(). 
638362306a36Sopenharmony_ci * We try charge one by one, but don't do any additional 638462306a36Sopenharmony_ci * charges to mc.to if we have failed in charge once in attach() 638562306a36Sopenharmony_ci * phase. 638662306a36Sopenharmony_ci */ 638762306a36Sopenharmony_ci ret = mem_cgroup_do_precharge(1); 638862306a36Sopenharmony_ci if (!ret) 638962306a36Sopenharmony_ci goto retry; 639062306a36Sopenharmony_ci } 639162306a36Sopenharmony_ci 639262306a36Sopenharmony_ci return ret; 639362306a36Sopenharmony_ci} 639462306a36Sopenharmony_ci 639562306a36Sopenharmony_cistatic const struct mm_walk_ops charge_walk_ops = { 639662306a36Sopenharmony_ci .pmd_entry = mem_cgroup_move_charge_pte_range, 639762306a36Sopenharmony_ci .walk_lock = PGWALK_RDLOCK, 639862306a36Sopenharmony_ci}; 639962306a36Sopenharmony_ci 640062306a36Sopenharmony_cistatic void mem_cgroup_move_charge(void) 640162306a36Sopenharmony_ci{ 640262306a36Sopenharmony_ci lru_add_drain_all(); 640362306a36Sopenharmony_ci /* 640462306a36Sopenharmony_ci * Signal folio_memcg_lock() to take the memcg's move_lock 640562306a36Sopenharmony_ci * while we're moving its pages to another memcg. Then wait 640662306a36Sopenharmony_ci * for already started RCU-only updates to finish. 640762306a36Sopenharmony_ci */ 640862306a36Sopenharmony_ci atomic_inc(&mc.from->moving_account); 640962306a36Sopenharmony_ci synchronize_rcu(); 641062306a36Sopenharmony_ciretry: 641162306a36Sopenharmony_ci if (unlikely(!mmap_read_trylock(mc.mm))) { 641262306a36Sopenharmony_ci /* 641362306a36Sopenharmony_ci * Someone who are holding the mmap_lock might be waiting in 641462306a36Sopenharmony_ci * waitq. So we cancel all extra charges, wake up all waiters, 641562306a36Sopenharmony_ci * and retry. Because we cancel precharges, we might not be able 641662306a36Sopenharmony_ci * to move enough charges, but moving charge is a best-effort 641762306a36Sopenharmony_ci * feature anyway, so it wouldn't be a big problem. 
641862306a36Sopenharmony_ci */ 641962306a36Sopenharmony_ci __mem_cgroup_clear_mc(); 642062306a36Sopenharmony_ci cond_resched(); 642162306a36Sopenharmony_ci goto retry; 642262306a36Sopenharmony_ci } 642362306a36Sopenharmony_ci /* 642462306a36Sopenharmony_ci * When we have consumed all precharges and failed in doing 642562306a36Sopenharmony_ci * additional charge, the page walk just aborts. 642662306a36Sopenharmony_ci */ 642762306a36Sopenharmony_ci walk_page_range(mc.mm, 0, ULONG_MAX, &charge_walk_ops, NULL); 642862306a36Sopenharmony_ci mmap_read_unlock(mc.mm); 642962306a36Sopenharmony_ci atomic_dec(&mc.from->moving_account); 643062306a36Sopenharmony_ci} 643162306a36Sopenharmony_ci 643262306a36Sopenharmony_cistatic void mem_cgroup_move_task(void) 643362306a36Sopenharmony_ci{ 643462306a36Sopenharmony_ci if (mc.to) { 643562306a36Sopenharmony_ci mem_cgroup_move_charge(); 643662306a36Sopenharmony_ci mem_cgroup_clear_mc(); 643762306a36Sopenharmony_ci } 643862306a36Sopenharmony_ci} 643962306a36Sopenharmony_ci#else /* !CONFIG_MMU */ 644062306a36Sopenharmony_cistatic int mem_cgroup_can_attach(struct cgroup_taskset *tset) 644162306a36Sopenharmony_ci{ 644262306a36Sopenharmony_ci return 0; 644362306a36Sopenharmony_ci} 644462306a36Sopenharmony_cistatic void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) 644562306a36Sopenharmony_ci{ 644662306a36Sopenharmony_ci} 644762306a36Sopenharmony_cistatic void mem_cgroup_move_task(void) 644862306a36Sopenharmony_ci{ 644962306a36Sopenharmony_ci} 645062306a36Sopenharmony_ci#endif 645162306a36Sopenharmony_ci 645262306a36Sopenharmony_ci#ifdef CONFIG_LRU_GEN 645362306a36Sopenharmony_cistatic void mem_cgroup_attach(struct cgroup_taskset *tset) 645462306a36Sopenharmony_ci{ 645562306a36Sopenharmony_ci struct task_struct *task; 645662306a36Sopenharmony_ci struct cgroup_subsys_state *css; 645762306a36Sopenharmony_ci 645862306a36Sopenharmony_ci /* find the first leader if there is any */ 645962306a36Sopenharmony_ci 
cgroup_taskset_for_each_leader(task, css, tset) 646062306a36Sopenharmony_ci break; 646162306a36Sopenharmony_ci 646262306a36Sopenharmony_ci if (!task) 646362306a36Sopenharmony_ci return; 646462306a36Sopenharmony_ci 646562306a36Sopenharmony_ci task_lock(task); 646662306a36Sopenharmony_ci if (task->mm && READ_ONCE(task->mm->owner) == task) 646762306a36Sopenharmony_ci lru_gen_migrate_mm(task->mm); 646862306a36Sopenharmony_ci task_unlock(task); 646962306a36Sopenharmony_ci} 647062306a36Sopenharmony_ci#else 647162306a36Sopenharmony_cistatic void mem_cgroup_attach(struct cgroup_taskset *tset) 647262306a36Sopenharmony_ci{ 647362306a36Sopenharmony_ci} 647462306a36Sopenharmony_ci#endif /* CONFIG_LRU_GEN */ 647562306a36Sopenharmony_ci 647662306a36Sopenharmony_cistatic int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) 647762306a36Sopenharmony_ci{ 647862306a36Sopenharmony_ci if (value == PAGE_COUNTER_MAX) 647962306a36Sopenharmony_ci seq_puts(m, "max\n"); 648062306a36Sopenharmony_ci else 648162306a36Sopenharmony_ci seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE); 648262306a36Sopenharmony_ci 648362306a36Sopenharmony_ci return 0; 648462306a36Sopenharmony_ci} 648562306a36Sopenharmony_ci 648662306a36Sopenharmony_cistatic u64 memory_current_read(struct cgroup_subsys_state *css, 648762306a36Sopenharmony_ci struct cftype *cft) 648862306a36Sopenharmony_ci{ 648962306a36Sopenharmony_ci struct mem_cgroup *memcg = mem_cgroup_from_css(css); 649062306a36Sopenharmony_ci 649162306a36Sopenharmony_ci return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE; 649262306a36Sopenharmony_ci} 649362306a36Sopenharmony_ci 649462306a36Sopenharmony_cistatic u64 memory_peak_read(struct cgroup_subsys_state *css, 649562306a36Sopenharmony_ci struct cftype *cft) 649662306a36Sopenharmony_ci{ 649762306a36Sopenharmony_ci struct mem_cgroup *memcg = mem_cgroup_from_css(css); 649862306a36Sopenharmony_ci 649962306a36Sopenharmony_ci return (u64)memcg->memory.watermark * PAGE_SIZE; 
650062306a36Sopenharmony_ci} 650162306a36Sopenharmony_ci 650262306a36Sopenharmony_cistatic int memory_min_show(struct seq_file *m, void *v) 650362306a36Sopenharmony_ci{ 650462306a36Sopenharmony_ci return seq_puts_memcg_tunable(m, 650562306a36Sopenharmony_ci READ_ONCE(mem_cgroup_from_seq(m)->memory.min)); 650662306a36Sopenharmony_ci} 650762306a36Sopenharmony_ci 650862306a36Sopenharmony_cistatic ssize_t memory_min_write(struct kernfs_open_file *of, 650962306a36Sopenharmony_ci char *buf, size_t nbytes, loff_t off) 651062306a36Sopenharmony_ci{ 651162306a36Sopenharmony_ci struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 651262306a36Sopenharmony_ci unsigned long min; 651362306a36Sopenharmony_ci int err; 651462306a36Sopenharmony_ci 651562306a36Sopenharmony_ci buf = strstrip(buf); 651662306a36Sopenharmony_ci err = page_counter_memparse(buf, "max", &min); 651762306a36Sopenharmony_ci if (err) 651862306a36Sopenharmony_ci return err; 651962306a36Sopenharmony_ci 652062306a36Sopenharmony_ci page_counter_set_min(&memcg->memory, min); 652162306a36Sopenharmony_ci 652262306a36Sopenharmony_ci return nbytes; 652362306a36Sopenharmony_ci} 652462306a36Sopenharmony_ci 652562306a36Sopenharmony_cistatic int memory_low_show(struct seq_file *m, void *v) 652662306a36Sopenharmony_ci{ 652762306a36Sopenharmony_ci return seq_puts_memcg_tunable(m, 652862306a36Sopenharmony_ci READ_ONCE(mem_cgroup_from_seq(m)->memory.low)); 652962306a36Sopenharmony_ci} 653062306a36Sopenharmony_ci 653162306a36Sopenharmony_cistatic ssize_t memory_low_write(struct kernfs_open_file *of, 653262306a36Sopenharmony_ci char *buf, size_t nbytes, loff_t off) 653362306a36Sopenharmony_ci{ 653462306a36Sopenharmony_ci struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 653562306a36Sopenharmony_ci unsigned long low; 653662306a36Sopenharmony_ci int err; 653762306a36Sopenharmony_ci 653862306a36Sopenharmony_ci buf = strstrip(buf); 653962306a36Sopenharmony_ci err = page_counter_memparse(buf, "max", &low); 
654062306a36Sopenharmony_ci if (err) 654162306a36Sopenharmony_ci return err; 654262306a36Sopenharmony_ci 654362306a36Sopenharmony_ci page_counter_set_low(&memcg->memory, low); 654462306a36Sopenharmony_ci 654562306a36Sopenharmony_ci return nbytes; 654662306a36Sopenharmony_ci} 654762306a36Sopenharmony_ci 654862306a36Sopenharmony_cistatic int memory_high_show(struct seq_file *m, void *v) 654962306a36Sopenharmony_ci{ 655062306a36Sopenharmony_ci return seq_puts_memcg_tunable(m, 655162306a36Sopenharmony_ci READ_ONCE(mem_cgroup_from_seq(m)->memory.high)); 655262306a36Sopenharmony_ci} 655362306a36Sopenharmony_ci 655462306a36Sopenharmony_cistatic ssize_t memory_high_write(struct kernfs_open_file *of, 655562306a36Sopenharmony_ci char *buf, size_t nbytes, loff_t off) 655662306a36Sopenharmony_ci{ 655762306a36Sopenharmony_ci struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 655862306a36Sopenharmony_ci unsigned int nr_retries = MAX_RECLAIM_RETRIES; 655962306a36Sopenharmony_ci bool drained = false; 656062306a36Sopenharmony_ci unsigned long high; 656162306a36Sopenharmony_ci int err; 656262306a36Sopenharmony_ci 656362306a36Sopenharmony_ci buf = strstrip(buf); 656462306a36Sopenharmony_ci err = page_counter_memparse(buf, "max", &high); 656562306a36Sopenharmony_ci if (err) 656662306a36Sopenharmony_ci return err; 656762306a36Sopenharmony_ci 656862306a36Sopenharmony_ci page_counter_set_high(&memcg->memory, high); 656962306a36Sopenharmony_ci 657062306a36Sopenharmony_ci for (;;) { 657162306a36Sopenharmony_ci unsigned long nr_pages = page_counter_read(&memcg->memory); 657262306a36Sopenharmony_ci unsigned long reclaimed; 657362306a36Sopenharmony_ci 657462306a36Sopenharmony_ci if (nr_pages <= high) 657562306a36Sopenharmony_ci break; 657662306a36Sopenharmony_ci 657762306a36Sopenharmony_ci if (signal_pending(current)) 657862306a36Sopenharmony_ci break; 657962306a36Sopenharmony_ci 658062306a36Sopenharmony_ci if (!drained) { 658162306a36Sopenharmony_ci drain_all_stock(memcg); 
658262306a36Sopenharmony_ci drained = true; 658362306a36Sopenharmony_ci continue; 658462306a36Sopenharmony_ci } 658562306a36Sopenharmony_ci 658662306a36Sopenharmony_ci reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high, 658762306a36Sopenharmony_ci GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP); 658862306a36Sopenharmony_ci 658962306a36Sopenharmony_ci if (!reclaimed && !nr_retries--) 659062306a36Sopenharmony_ci break; 659162306a36Sopenharmony_ci } 659262306a36Sopenharmony_ci 659362306a36Sopenharmony_ci memcg_wb_domain_size_changed(memcg); 659462306a36Sopenharmony_ci return nbytes; 659562306a36Sopenharmony_ci} 659662306a36Sopenharmony_ci 659762306a36Sopenharmony_cistatic int memory_max_show(struct seq_file *m, void *v) 659862306a36Sopenharmony_ci{ 659962306a36Sopenharmony_ci return seq_puts_memcg_tunable(m, 660062306a36Sopenharmony_ci READ_ONCE(mem_cgroup_from_seq(m)->memory.max)); 660162306a36Sopenharmony_ci} 660262306a36Sopenharmony_ci 660362306a36Sopenharmony_cistatic ssize_t memory_max_write(struct kernfs_open_file *of, 660462306a36Sopenharmony_ci char *buf, size_t nbytes, loff_t off) 660562306a36Sopenharmony_ci{ 660662306a36Sopenharmony_ci struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 660762306a36Sopenharmony_ci unsigned int nr_reclaims = MAX_RECLAIM_RETRIES; 660862306a36Sopenharmony_ci bool drained = false; 660962306a36Sopenharmony_ci unsigned long max; 661062306a36Sopenharmony_ci int err; 661162306a36Sopenharmony_ci 661262306a36Sopenharmony_ci buf = strstrip(buf); 661362306a36Sopenharmony_ci err = page_counter_memparse(buf, "max", &max); 661462306a36Sopenharmony_ci if (err) 661562306a36Sopenharmony_ci return err; 661662306a36Sopenharmony_ci 661762306a36Sopenharmony_ci xchg(&memcg->memory.max, max); 661862306a36Sopenharmony_ci 661962306a36Sopenharmony_ci for (;;) { 662062306a36Sopenharmony_ci unsigned long nr_pages = page_counter_read(&memcg->memory); 662162306a36Sopenharmony_ci 662262306a36Sopenharmony_ci if (nr_pages <= max) 
662362306a36Sopenharmony_ci break; 662462306a36Sopenharmony_ci 662562306a36Sopenharmony_ci if (signal_pending(current)) 662662306a36Sopenharmony_ci break; 662762306a36Sopenharmony_ci 662862306a36Sopenharmony_ci if (!drained) { 662962306a36Sopenharmony_ci drain_all_stock(memcg); 663062306a36Sopenharmony_ci drained = true; 663162306a36Sopenharmony_ci continue; 663262306a36Sopenharmony_ci } 663362306a36Sopenharmony_ci 663462306a36Sopenharmony_ci if (nr_reclaims) { 663562306a36Sopenharmony_ci if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max, 663662306a36Sopenharmony_ci GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP)) 663762306a36Sopenharmony_ci nr_reclaims--; 663862306a36Sopenharmony_ci continue; 663962306a36Sopenharmony_ci } 664062306a36Sopenharmony_ci 664162306a36Sopenharmony_ci memcg_memory_event(memcg, MEMCG_OOM); 664262306a36Sopenharmony_ci if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0)) 664362306a36Sopenharmony_ci break; 664462306a36Sopenharmony_ci } 664562306a36Sopenharmony_ci 664662306a36Sopenharmony_ci memcg_wb_domain_size_changed(memcg); 664762306a36Sopenharmony_ci return nbytes; 664862306a36Sopenharmony_ci} 664962306a36Sopenharmony_ci 665062306a36Sopenharmony_cistatic void __memory_events_show(struct seq_file *m, atomic_long_t *events) 665162306a36Sopenharmony_ci{ 665262306a36Sopenharmony_ci seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW])); 665362306a36Sopenharmony_ci seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH])); 665462306a36Sopenharmony_ci seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX])); 665562306a36Sopenharmony_ci seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM])); 665662306a36Sopenharmony_ci seq_printf(m, "oom_kill %lu\n", 665762306a36Sopenharmony_ci atomic_long_read(&events[MEMCG_OOM_KILL])); 665862306a36Sopenharmony_ci seq_printf(m, "oom_group_kill %lu\n", 665962306a36Sopenharmony_ci atomic_long_read(&events[MEMCG_OOM_GROUP_KILL])); 666062306a36Sopenharmony_ci} 

/* memory.events: hierarchical event counters. */
static int memory_events_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);

	__memory_events_show(m, memcg->memory_events);
	return 0;
}

/* memory.events.local: events that happened in this memcg itself. */
static int memory_events_local_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);

	__memory_events_show(m, memcg->memory_events_local);
	return 0;
}

/* memory.stat: detailed memory statistics, formatted into a page buffer. */
static int memory_stat_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
	char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
	struct seq_buf s;

	if (!buf)
		return -ENOMEM;
	seq_buf_init(&s, buf, PAGE_SIZE);
	memory_stat_format(memcg, &s);
	seq_puts(m, buf);
	kfree(buf);
#ifdef CONFIG_HYPERHOLD_DEBUG
	/* OpenHarmony extension: append extended-swap info to memory.stat */
	memcg_eswap_info_show(m);
#endif
	return 0;
}

#ifdef CONFIG_NUMA
/* Scale a per-lruvec stat item to its reporting unit (typically bytes). */
static inline unsigned long lruvec_page_state_output(struct lruvec *lruvec,
						     int item)
{
	return lruvec_page_state(lruvec, item) * memcg_page_state_unit(item);
}

/* memory.numa_stat: per-node breakdown of the node-level stat items. */
static int memory_numa_stat_show(struct seq_file *m, void *v)
{
	int i;
	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);

	mem_cgroup_flush_stats();

	for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
		int nid;

		/* only node-level items have a per-node breakdown */
		if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS)
			continue;

		seq_printf(m, "%s", memory_stats[i].name);
		for_each_node_state(nid, N_MEMORY) {
			u64 size;
			struct lruvec *lruvec;

			lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
			size = lruvec_page_state_output(lruvec,
							memory_stats[i].idx);
			seq_printf(m, " N%d=%llu", nid, size);
		}
		seq_putc(m, '\n');
	}

	return 0;
}
#endif

/* memory.oom.group: whether the memcg is OOM-killed as an indivisible unit. */
static int memory_oom_group_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);

	seq_printf(m, "%d\n", READ_ONCE(memcg->oom_group));

	return 0;
}

static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
				      char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	int ret, oom_group;

	buf = strstrip(buf);
	if (!buf)
		return -EINVAL;

	ret = kstrtoint(buf, 0, &oom_group);
	if (ret)
		return ret;

	/* only boolean values are accepted */
	if (oom_group != 0 && oom_group != 1)
		return -EINVAL;

	WRITE_ONCE(memcg->oom_group, oom_group);

	return nbytes;
}

/*
 * memory.reclaim: proactive reclaim interface. Writing a byte count
 * attempts to reclaim that much from the memcg, in SWAP_CLUSTER_MAX
 * batches, returning -EAGAIN if the target can't be met and -EINTR if
 * interrupted by a signal.
 */
static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
			      size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned int nr_retries = MAX_RECLAIM_RETRIES;
	unsigned long nr_to_reclaim, nr_reclaimed = 0;
	unsigned int reclaim_options;
	int err;

	buf = strstrip(buf);
	err = page_counter_memparse(buf, "", &nr_to_reclaim);
	if (err)
		return err;

	reclaim_options	= MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE;
	while (nr_reclaimed < nr_to_reclaim) {
		unsigned long reclaimed;

		if (signal_pending(current))
			return -EINTR;

		/*
		 * This is the final attempt, drain percpu lru caches in the
		 * hope of introducing more evictable pages for
		 * try_to_free_mem_cgroup_pages().
		 */
		if (!nr_retries)
			lru_add_drain_all();

		reclaimed = try_to_free_mem_cgroup_pages(memcg,
				min(nr_to_reclaim - nr_reclaimed, SWAP_CLUSTER_MAX),
				GFP_KERNEL, reclaim_options);

		if (!reclaimed && !nr_retries--)
			return -EAGAIN;

		nr_reclaimed += reclaimed;
	}

	return nbytes;
}

/* cgroup v2 memory controller interface files (continues past this chunk). */
static struct cftype memory_files[] = {
	{
		.name = "current",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_u64 = memory_current_read,
	},
	{
		.name = "peak",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_u64 = memory_peak_read,
	},
	{
		.name = "min",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = memory_min_show,
		.write = memory_min_write,
	},
	{
		.name = "low",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = memory_low_show,
		.write = memory_low_write,
	},
	{
		.name = "high",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = memory_high_show,
		.write = memory_high_write,
	},
	{
		.name = "max",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = memory_max_show,
		.write = memory_max_write,
	},
	{
		.name = "events",
		.flags = CFTYPE_NOT_ON_ROOT,
		.file_offset = offsetof(struct mem_cgroup, events_file),
		.seq_show = memory_events_show,
	},
	{
		.name = "events.local",
		.flags = CFTYPE_NOT_ON_ROOT,
		.file_offset = offsetof(struct mem_cgroup, events_local_file),
		.seq_show = memory_events_local_show,
	},
	{
		.name = "stat",
		.seq_show = memory_stat_show,
	},
#ifdef CONFIG_NUMA
	{
		.name = "numa_stat",
		.seq_show = memory_numa_stat_show,
	},
#endif
	{
		.name = "oom.group",
		.flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
		.seq_show = memory_oom_group_show,
686762306a36Sopenharmony_ci .write = memory_oom_group_write, 686862306a36Sopenharmony_ci }, 686962306a36Sopenharmony_ci { 687062306a36Sopenharmony_ci .name = "reclaim", 687162306a36Sopenharmony_ci .flags = CFTYPE_NS_DELEGATABLE, 687262306a36Sopenharmony_ci .write = memory_reclaim, 687362306a36Sopenharmony_ci }, 687462306a36Sopenharmony_ci { } /* terminate */ 687562306a36Sopenharmony_ci}; 687662306a36Sopenharmony_ci 687762306a36Sopenharmony_cistruct cgroup_subsys memory_cgrp_subsys = { 687862306a36Sopenharmony_ci .css_alloc = mem_cgroup_css_alloc, 687962306a36Sopenharmony_ci .css_online = mem_cgroup_css_online, 688062306a36Sopenharmony_ci .css_offline = mem_cgroup_css_offline, 688162306a36Sopenharmony_ci .css_released = mem_cgroup_css_released, 688262306a36Sopenharmony_ci .css_free = mem_cgroup_css_free, 688362306a36Sopenharmony_ci .css_reset = mem_cgroup_css_reset, 688462306a36Sopenharmony_ci .css_rstat_flush = mem_cgroup_css_rstat_flush, 688562306a36Sopenharmony_ci .can_attach = mem_cgroup_can_attach, 688662306a36Sopenharmony_ci .attach = mem_cgroup_attach, 688762306a36Sopenharmony_ci .cancel_attach = mem_cgroup_cancel_attach, 688862306a36Sopenharmony_ci .post_attach = mem_cgroup_move_task, 688962306a36Sopenharmony_ci .dfl_cftypes = memory_files, 689062306a36Sopenharmony_ci .legacy_cftypes = mem_cgroup_legacy_files, 689162306a36Sopenharmony_ci .early_init = 0, 689262306a36Sopenharmony_ci}; 689362306a36Sopenharmony_ci 689462306a36Sopenharmony_ci/* 689562306a36Sopenharmony_ci * This function calculates an individual cgroup's effective 689662306a36Sopenharmony_ci * protection which is derived from its own memory.min/low, its 689762306a36Sopenharmony_ci * parent's and siblings' settings, as well as the actual memory 689862306a36Sopenharmony_ci * distribution in the tree. 689962306a36Sopenharmony_ci * 690062306a36Sopenharmony_ci * The following rules apply to the effective protection values: 690162306a36Sopenharmony_ci * 690262306a36Sopenharmony_ci * 1. 
At the first level of reclaim, effective protection is equal to 690362306a36Sopenharmony_ci * the declared protection in memory.min and memory.low. 690462306a36Sopenharmony_ci * 690562306a36Sopenharmony_ci * 2. To enable safe delegation of the protection configuration, at 690662306a36Sopenharmony_ci * subsequent levels the effective protection is capped to the 690762306a36Sopenharmony_ci * parent's effective protection. 690862306a36Sopenharmony_ci * 690962306a36Sopenharmony_ci * 3. To make complex and dynamic subtrees easier to configure, the 691062306a36Sopenharmony_ci * user is allowed to overcommit the declared protection at a given 691162306a36Sopenharmony_ci * level. If that is the case, the parent's effective protection is 691262306a36Sopenharmony_ci * distributed to the children in proportion to how much protection 691362306a36Sopenharmony_ci * they have declared and how much of it they are utilizing. 691462306a36Sopenharmony_ci * 691562306a36Sopenharmony_ci * This makes distribution proportional, but also work-conserving: 691662306a36Sopenharmony_ci * if one cgroup claims much more protection than it uses memory, 691762306a36Sopenharmony_ci * the unused remainder is available to its siblings. 691862306a36Sopenharmony_ci * 691962306a36Sopenharmony_ci * 4. Conversely, when the declared protection is undercommitted at a 692062306a36Sopenharmony_ci * given level, the distribution of the larger parental protection 692162306a36Sopenharmony_ci * budget is NOT proportional. A cgroup's protection from a sibling 692262306a36Sopenharmony_ci * is capped to its own memory.min/low setting. 692362306a36Sopenharmony_ci * 692462306a36Sopenharmony_ci * 5. 
However, to allow protecting recursive subtrees from each other 692562306a36Sopenharmony_ci * without having to declare each individual cgroup's fixed share 692662306a36Sopenharmony_ci * of the ancestor's claim to protection, any unutilized - 692762306a36Sopenharmony_ci * "floating" - protection from up the tree is distributed in 692862306a36Sopenharmony_ci * proportion to each cgroup's *usage*. This makes the protection 692962306a36Sopenharmony_ci * neutral wrt sibling cgroups and lets them compete freely over 693062306a36Sopenharmony_ci * the shared parental protection budget, but it protects the 693162306a36Sopenharmony_ci * subtree as a whole from neighboring subtrees. 693262306a36Sopenharmony_ci * 693362306a36Sopenharmony_ci * Note that 4. and 5. are not in conflict: 4. is about protecting 693462306a36Sopenharmony_ci * against immediate siblings whereas 5. is about protecting against 693562306a36Sopenharmony_ci * neighboring subtrees. 693662306a36Sopenharmony_ci */ 693762306a36Sopenharmony_cistatic unsigned long effective_protection(unsigned long usage, 693862306a36Sopenharmony_ci unsigned long parent_usage, 693962306a36Sopenharmony_ci unsigned long setting, 694062306a36Sopenharmony_ci unsigned long parent_effective, 694162306a36Sopenharmony_ci unsigned long siblings_protected) 694262306a36Sopenharmony_ci{ 694362306a36Sopenharmony_ci unsigned long protected; 694462306a36Sopenharmony_ci unsigned long ep; 694562306a36Sopenharmony_ci 694662306a36Sopenharmony_ci protected = min(usage, setting); 694762306a36Sopenharmony_ci /* 694862306a36Sopenharmony_ci * If all cgroups at this level combined claim and use more 694962306a36Sopenharmony_ci * protection than what the parent affords them, distribute 695062306a36Sopenharmony_ci * shares in proportion to utilization. 
695162306a36Sopenharmony_ci * 695262306a36Sopenharmony_ci * We are using actual utilization rather than the statically 695362306a36Sopenharmony_ci * claimed protection in order to be work-conserving: claimed 695462306a36Sopenharmony_ci * but unused protection is available to siblings that would 695562306a36Sopenharmony_ci * otherwise get a smaller chunk than what they claimed. 695662306a36Sopenharmony_ci */ 695762306a36Sopenharmony_ci if (siblings_protected > parent_effective) 695862306a36Sopenharmony_ci return protected * parent_effective / siblings_protected; 695962306a36Sopenharmony_ci 696062306a36Sopenharmony_ci /* 696162306a36Sopenharmony_ci * Ok, utilized protection of all children is within what the 696262306a36Sopenharmony_ci * parent affords them, so we know whatever this child claims 696362306a36Sopenharmony_ci * and utilizes is effectively protected. 696462306a36Sopenharmony_ci * 696562306a36Sopenharmony_ci * If there is unprotected usage beyond this value, reclaim 696662306a36Sopenharmony_ci * will apply pressure in proportion to that amount. 696762306a36Sopenharmony_ci * 696862306a36Sopenharmony_ci * If there is unutilized protection, the cgroup will be fully 696962306a36Sopenharmony_ci * shielded from reclaim, but we do return a smaller value for 697062306a36Sopenharmony_ci * protection than what the group could enjoy in theory. This 697162306a36Sopenharmony_ci * is okay. With the overcommit distribution above, effective 697262306a36Sopenharmony_ci * protection is always dependent on how memory is actually 697362306a36Sopenharmony_ci * consumed among the siblings anyway. 
697462306a36Sopenharmony_ci */ 697562306a36Sopenharmony_ci ep = protected; 697662306a36Sopenharmony_ci 697762306a36Sopenharmony_ci /* 697862306a36Sopenharmony_ci * If the children aren't claiming (all of) the protection 697962306a36Sopenharmony_ci * afforded to them by the parent, distribute the remainder in 698062306a36Sopenharmony_ci * proportion to the (unprotected) memory of each cgroup. That 698162306a36Sopenharmony_ci * way, cgroups that aren't explicitly prioritized wrt each 698262306a36Sopenharmony_ci * other compete freely over the allowance, but they are 698362306a36Sopenharmony_ci * collectively protected from neighboring trees. 698462306a36Sopenharmony_ci * 698562306a36Sopenharmony_ci * We're using unprotected memory for the weight so that if 698662306a36Sopenharmony_ci * some cgroups DO claim explicit protection, we don't protect 698762306a36Sopenharmony_ci * the same bytes twice. 698862306a36Sopenharmony_ci * 698962306a36Sopenharmony_ci * Check both usage and parent_usage against the respective 699062306a36Sopenharmony_ci * protected values. One should imply the other, but they 699162306a36Sopenharmony_ci * aren't read atomically - make sure the division is sane. 
699262306a36Sopenharmony_ci */ 699362306a36Sopenharmony_ci if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)) 699462306a36Sopenharmony_ci return ep; 699562306a36Sopenharmony_ci if (parent_effective > siblings_protected && 699662306a36Sopenharmony_ci parent_usage > siblings_protected && 699762306a36Sopenharmony_ci usage > protected) { 699862306a36Sopenharmony_ci unsigned long unclaimed; 699962306a36Sopenharmony_ci 700062306a36Sopenharmony_ci unclaimed = parent_effective - siblings_protected; 700162306a36Sopenharmony_ci unclaimed *= usage - protected; 700262306a36Sopenharmony_ci unclaimed /= parent_usage - siblings_protected; 700362306a36Sopenharmony_ci 700462306a36Sopenharmony_ci ep += unclaimed; 700562306a36Sopenharmony_ci } 700662306a36Sopenharmony_ci 700762306a36Sopenharmony_ci return ep; 700862306a36Sopenharmony_ci} 700962306a36Sopenharmony_ci 701062306a36Sopenharmony_ci/** 701162306a36Sopenharmony_ci * mem_cgroup_calculate_protection - check if memory consumption is in the normal range 701262306a36Sopenharmony_ci * @root: the top ancestor of the sub-tree being checked 701362306a36Sopenharmony_ci * @memcg: the memory cgroup to check 701462306a36Sopenharmony_ci * 701562306a36Sopenharmony_ci * WARNING: This function is not stateless! It can only be used as part 701662306a36Sopenharmony_ci * of a top-down tree iteration, not for isolated queries. 
701762306a36Sopenharmony_ci */ 701862306a36Sopenharmony_civoid mem_cgroup_calculate_protection(struct mem_cgroup *root, 701962306a36Sopenharmony_ci struct mem_cgroup *memcg) 702062306a36Sopenharmony_ci{ 702162306a36Sopenharmony_ci unsigned long usage, parent_usage; 702262306a36Sopenharmony_ci struct mem_cgroup *parent; 702362306a36Sopenharmony_ci 702462306a36Sopenharmony_ci if (mem_cgroup_disabled()) 702562306a36Sopenharmony_ci return; 702662306a36Sopenharmony_ci 702762306a36Sopenharmony_ci if (!root) 702862306a36Sopenharmony_ci root = root_mem_cgroup; 702962306a36Sopenharmony_ci 703062306a36Sopenharmony_ci /* 703162306a36Sopenharmony_ci * Effective values of the reclaim targets are ignored so they 703262306a36Sopenharmony_ci * can be stale. Have a look at mem_cgroup_protection for more 703362306a36Sopenharmony_ci * details. 703462306a36Sopenharmony_ci * TODO: calculation should be more robust so that we do not need 703562306a36Sopenharmony_ci * that special casing. 703662306a36Sopenharmony_ci */ 703762306a36Sopenharmony_ci if (memcg == root) 703862306a36Sopenharmony_ci return; 703962306a36Sopenharmony_ci 704062306a36Sopenharmony_ci usage = page_counter_read(&memcg->memory); 704162306a36Sopenharmony_ci if (!usage) 704262306a36Sopenharmony_ci return; 704362306a36Sopenharmony_ci 704462306a36Sopenharmony_ci parent = parent_mem_cgroup(memcg); 704562306a36Sopenharmony_ci 704662306a36Sopenharmony_ci if (parent == root) { 704762306a36Sopenharmony_ci memcg->memory.emin = READ_ONCE(memcg->memory.min); 704862306a36Sopenharmony_ci memcg->memory.elow = READ_ONCE(memcg->memory.low); 704962306a36Sopenharmony_ci return; 705062306a36Sopenharmony_ci } 705162306a36Sopenharmony_ci 705262306a36Sopenharmony_ci parent_usage = page_counter_read(&parent->memory); 705362306a36Sopenharmony_ci 705462306a36Sopenharmony_ci WRITE_ONCE(memcg->memory.emin, effective_protection(usage, parent_usage, 705562306a36Sopenharmony_ci READ_ONCE(memcg->memory.min), 705662306a36Sopenharmony_ci 
READ_ONCE(parent->memory.emin), 705762306a36Sopenharmony_ci atomic_long_read(&parent->memory.children_min_usage))); 705862306a36Sopenharmony_ci 705962306a36Sopenharmony_ci WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage, 706062306a36Sopenharmony_ci READ_ONCE(memcg->memory.low), 706162306a36Sopenharmony_ci READ_ONCE(parent->memory.elow), 706262306a36Sopenharmony_ci atomic_long_read(&parent->memory.children_low_usage))); 706362306a36Sopenharmony_ci} 706462306a36Sopenharmony_ci 706562306a36Sopenharmony_cistatic int charge_memcg(struct folio *folio, struct mem_cgroup *memcg, 706662306a36Sopenharmony_ci gfp_t gfp) 706762306a36Sopenharmony_ci{ 706862306a36Sopenharmony_ci long nr_pages = folio_nr_pages(folio); 706962306a36Sopenharmony_ci int ret; 707062306a36Sopenharmony_ci 707162306a36Sopenharmony_ci ret = try_charge(memcg, gfp, nr_pages); 707262306a36Sopenharmony_ci if (ret) 707362306a36Sopenharmony_ci goto out; 707462306a36Sopenharmony_ci 707562306a36Sopenharmony_ci css_get(&memcg->css); 707662306a36Sopenharmony_ci commit_charge(folio, memcg); 707762306a36Sopenharmony_ci 707862306a36Sopenharmony_ci local_irq_disable(); 707962306a36Sopenharmony_ci mem_cgroup_charge_statistics(memcg, nr_pages); 708062306a36Sopenharmony_ci memcg_check_events(memcg, folio_nid(folio)); 708162306a36Sopenharmony_ci local_irq_enable(); 708262306a36Sopenharmony_ciout: 708362306a36Sopenharmony_ci return ret; 708462306a36Sopenharmony_ci} 708562306a36Sopenharmony_ci 708662306a36Sopenharmony_ciint __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp) 708762306a36Sopenharmony_ci{ 708862306a36Sopenharmony_ci struct mem_cgroup *memcg; 708962306a36Sopenharmony_ci int ret; 709062306a36Sopenharmony_ci 709162306a36Sopenharmony_ci memcg = get_mem_cgroup_from_mm(mm); 709262306a36Sopenharmony_ci ret = charge_memcg(folio, memcg, gfp); 709362306a36Sopenharmony_ci css_put(&memcg->css); 709462306a36Sopenharmony_ci 709562306a36Sopenharmony_ci return ret; 
709662306a36Sopenharmony_ci} 709762306a36Sopenharmony_ci 709862306a36Sopenharmony_ci/** 709962306a36Sopenharmony_ci * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin. 710062306a36Sopenharmony_ci * @folio: folio to charge. 710162306a36Sopenharmony_ci * @mm: mm context of the victim 710262306a36Sopenharmony_ci * @gfp: reclaim mode 710362306a36Sopenharmony_ci * @entry: swap entry for which the folio is allocated 710462306a36Sopenharmony_ci * 710562306a36Sopenharmony_ci * This function charges a folio allocated for swapin. Please call this before 710662306a36Sopenharmony_ci * adding the folio to the swapcache. 710762306a36Sopenharmony_ci * 710862306a36Sopenharmony_ci * Returns 0 on success. Otherwise, an error code is returned. 710962306a36Sopenharmony_ci */ 711062306a36Sopenharmony_ciint mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, 711162306a36Sopenharmony_ci gfp_t gfp, swp_entry_t entry) 711262306a36Sopenharmony_ci{ 711362306a36Sopenharmony_ci struct mem_cgroup *memcg; 711462306a36Sopenharmony_ci unsigned short id; 711562306a36Sopenharmony_ci int ret; 711662306a36Sopenharmony_ci 711762306a36Sopenharmony_ci if (mem_cgroup_disabled()) 711862306a36Sopenharmony_ci return 0; 711962306a36Sopenharmony_ci 712062306a36Sopenharmony_ci id = lookup_swap_cgroup_id(entry); 712162306a36Sopenharmony_ci rcu_read_lock(); 712262306a36Sopenharmony_ci memcg = mem_cgroup_from_id(id); 712362306a36Sopenharmony_ci if (!memcg || !css_tryget_online(&memcg->css)) 712462306a36Sopenharmony_ci memcg = get_mem_cgroup_from_mm(mm); 712562306a36Sopenharmony_ci rcu_read_unlock(); 712662306a36Sopenharmony_ci 712762306a36Sopenharmony_ci ret = charge_memcg(folio, memcg, gfp); 712862306a36Sopenharmony_ci 712962306a36Sopenharmony_ci css_put(&memcg->css); 713062306a36Sopenharmony_ci return ret; 713162306a36Sopenharmony_ci} 713262306a36Sopenharmony_ci 713362306a36Sopenharmony_ci/* 713462306a36Sopenharmony_ci * mem_cgroup_swapin_uncharge_swap - uncharge 
swap slot 713562306a36Sopenharmony_ci * @entry: swap entry for which the page is charged 713662306a36Sopenharmony_ci * 713762306a36Sopenharmony_ci * Call this function after successfully adding the charged page to swapcache. 713862306a36Sopenharmony_ci * 713962306a36Sopenharmony_ci * Note: This function assumes the page for which swap slot is being uncharged 714062306a36Sopenharmony_ci * is order 0 page. 714162306a36Sopenharmony_ci */ 714262306a36Sopenharmony_civoid mem_cgroup_swapin_uncharge_swap(swp_entry_t entry) 714362306a36Sopenharmony_ci{ 714462306a36Sopenharmony_ci /* 714562306a36Sopenharmony_ci * Cgroup1's unified memory+swap counter has been charged with the 714662306a36Sopenharmony_ci * new swapcache page, finish the transfer by uncharging the swap 714762306a36Sopenharmony_ci * slot. The swap slot would also get uncharged when it dies, but 714862306a36Sopenharmony_ci * it can stick around indefinitely and we'd count the page twice 714962306a36Sopenharmony_ci * the entire time. 715062306a36Sopenharmony_ci * 715162306a36Sopenharmony_ci * Cgroup2 has separate resource counters for memory and swap, 715262306a36Sopenharmony_ci * so this is a non-issue here. Memory and swap charge lifetimes 715362306a36Sopenharmony_ci * correspond 1:1 to page and swap slot lifetimes: we charge the 715462306a36Sopenharmony_ci * page to memory here, and uncharge swap when the slot is freed. 715562306a36Sopenharmony_ci */ 715662306a36Sopenharmony_ci if (!mem_cgroup_disabled() && do_memsw_account()) { 715762306a36Sopenharmony_ci /* 715862306a36Sopenharmony_ci * The swap entry might not get freed for a long time, 715962306a36Sopenharmony_ci * let's not wait for it. The page already received a 716062306a36Sopenharmony_ci * memory+swap charge, drop the swap entry duplicate. 
716162306a36Sopenharmony_ci */ 716262306a36Sopenharmony_ci mem_cgroup_uncharge_swap(entry, 1); 716362306a36Sopenharmony_ci } 716462306a36Sopenharmony_ci} 716562306a36Sopenharmony_ci 716662306a36Sopenharmony_cistruct uncharge_gather { 716762306a36Sopenharmony_ci struct mem_cgroup *memcg; 716862306a36Sopenharmony_ci unsigned long nr_memory; 716962306a36Sopenharmony_ci unsigned long pgpgout; 717062306a36Sopenharmony_ci unsigned long nr_kmem; 717162306a36Sopenharmony_ci int nid; 717262306a36Sopenharmony_ci}; 717362306a36Sopenharmony_ci 717462306a36Sopenharmony_cistatic inline void uncharge_gather_clear(struct uncharge_gather *ug) 717562306a36Sopenharmony_ci{ 717662306a36Sopenharmony_ci memset(ug, 0, sizeof(*ug)); 717762306a36Sopenharmony_ci} 717862306a36Sopenharmony_ci 717962306a36Sopenharmony_cistatic void uncharge_batch(const struct uncharge_gather *ug) 718062306a36Sopenharmony_ci{ 718162306a36Sopenharmony_ci unsigned long flags; 718262306a36Sopenharmony_ci 718362306a36Sopenharmony_ci if (ug->nr_memory) { 718462306a36Sopenharmony_ci page_counter_uncharge(&ug->memcg->memory, ug->nr_memory); 718562306a36Sopenharmony_ci if (do_memsw_account()) 718662306a36Sopenharmony_ci page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory); 718762306a36Sopenharmony_ci if (ug->nr_kmem) 718862306a36Sopenharmony_ci memcg_account_kmem(ug->memcg, -ug->nr_kmem); 718962306a36Sopenharmony_ci memcg_oom_recover(ug->memcg); 719062306a36Sopenharmony_ci } 719162306a36Sopenharmony_ci 719262306a36Sopenharmony_ci local_irq_save(flags); 719362306a36Sopenharmony_ci __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout); 719462306a36Sopenharmony_ci __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory); 719562306a36Sopenharmony_ci memcg_check_events(ug->memcg, ug->nid); 719662306a36Sopenharmony_ci local_irq_restore(flags); 719762306a36Sopenharmony_ci 719862306a36Sopenharmony_ci /* drop reference from uncharge_folio */ 719962306a36Sopenharmony_ci css_put(&ug->memcg->css); 
720062306a36Sopenharmony_ci} 720162306a36Sopenharmony_ci 720262306a36Sopenharmony_cistatic void uncharge_folio(struct folio *folio, struct uncharge_gather *ug) 720362306a36Sopenharmony_ci{ 720462306a36Sopenharmony_ci long nr_pages; 720562306a36Sopenharmony_ci struct mem_cgroup *memcg; 720662306a36Sopenharmony_ci struct obj_cgroup *objcg; 720762306a36Sopenharmony_ci 720862306a36Sopenharmony_ci VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); 720962306a36Sopenharmony_ci 721062306a36Sopenharmony_ci /* 721162306a36Sopenharmony_ci * Nobody should be changing or seriously looking at 721262306a36Sopenharmony_ci * folio memcg or objcg at this point, we have fully 721362306a36Sopenharmony_ci * exclusive access to the folio. 721462306a36Sopenharmony_ci */ 721562306a36Sopenharmony_ci if (folio_memcg_kmem(folio)) { 721662306a36Sopenharmony_ci objcg = __folio_objcg(folio); 721762306a36Sopenharmony_ci /* 721862306a36Sopenharmony_ci * This get matches the put at the end of the function and 721962306a36Sopenharmony_ci * kmem pages do not hold memcg references anymore. 
722062306a36Sopenharmony_ci */ 722162306a36Sopenharmony_ci memcg = get_mem_cgroup_from_objcg(objcg); 722262306a36Sopenharmony_ci } else { 722362306a36Sopenharmony_ci memcg = __folio_memcg(folio); 722462306a36Sopenharmony_ci } 722562306a36Sopenharmony_ci 722662306a36Sopenharmony_ci if (!memcg) 722762306a36Sopenharmony_ci return; 722862306a36Sopenharmony_ci 722962306a36Sopenharmony_ci if (ug->memcg != memcg) { 723062306a36Sopenharmony_ci if (ug->memcg) { 723162306a36Sopenharmony_ci uncharge_batch(ug); 723262306a36Sopenharmony_ci uncharge_gather_clear(ug); 723362306a36Sopenharmony_ci } 723462306a36Sopenharmony_ci ug->memcg = memcg; 723562306a36Sopenharmony_ci ug->nid = folio_nid(folio); 723662306a36Sopenharmony_ci 723762306a36Sopenharmony_ci /* pairs with css_put in uncharge_batch */ 723862306a36Sopenharmony_ci css_get(&memcg->css); 723962306a36Sopenharmony_ci } 724062306a36Sopenharmony_ci 724162306a36Sopenharmony_ci nr_pages = folio_nr_pages(folio); 724262306a36Sopenharmony_ci 724362306a36Sopenharmony_ci if (folio_memcg_kmem(folio)) { 724462306a36Sopenharmony_ci ug->nr_memory += nr_pages; 724562306a36Sopenharmony_ci ug->nr_kmem += nr_pages; 724662306a36Sopenharmony_ci 724762306a36Sopenharmony_ci folio->memcg_data = 0; 724862306a36Sopenharmony_ci obj_cgroup_put(objcg); 724962306a36Sopenharmony_ci } else { 725062306a36Sopenharmony_ci /* LRU pages aren't accounted at the root level */ 725162306a36Sopenharmony_ci if (!mem_cgroup_is_root(memcg)) 725262306a36Sopenharmony_ci ug->nr_memory += nr_pages; 725362306a36Sopenharmony_ci ug->pgpgout++; 725462306a36Sopenharmony_ci 725562306a36Sopenharmony_ci folio->memcg_data = 0; 725662306a36Sopenharmony_ci } 725762306a36Sopenharmony_ci 725862306a36Sopenharmony_ci css_put(&memcg->css); 725962306a36Sopenharmony_ci} 726062306a36Sopenharmony_ci 726162306a36Sopenharmony_civoid __mem_cgroup_uncharge(struct folio *folio) 726262306a36Sopenharmony_ci{ 726362306a36Sopenharmony_ci struct uncharge_gather ug; 726462306a36Sopenharmony_ci 
726562306a36Sopenharmony_ci /* Don't touch folio->lru of any random page, pre-check: */ 726662306a36Sopenharmony_ci if (!folio_memcg(folio)) 726762306a36Sopenharmony_ci return; 726862306a36Sopenharmony_ci 726962306a36Sopenharmony_ci uncharge_gather_clear(&ug); 727062306a36Sopenharmony_ci uncharge_folio(folio, &ug); 727162306a36Sopenharmony_ci uncharge_batch(&ug); 727262306a36Sopenharmony_ci} 727362306a36Sopenharmony_ci 727462306a36Sopenharmony_ci/** 727562306a36Sopenharmony_ci * __mem_cgroup_uncharge_list - uncharge a list of page 727662306a36Sopenharmony_ci * @page_list: list of pages to uncharge 727762306a36Sopenharmony_ci * 727862306a36Sopenharmony_ci * Uncharge a list of pages previously charged with 727962306a36Sopenharmony_ci * __mem_cgroup_charge(). 728062306a36Sopenharmony_ci */ 728162306a36Sopenharmony_civoid __mem_cgroup_uncharge_list(struct list_head *page_list) 728262306a36Sopenharmony_ci{ 728362306a36Sopenharmony_ci struct uncharge_gather ug; 728462306a36Sopenharmony_ci struct folio *folio; 728562306a36Sopenharmony_ci 728662306a36Sopenharmony_ci uncharge_gather_clear(&ug); 728762306a36Sopenharmony_ci list_for_each_entry(folio, page_list, lru) 728862306a36Sopenharmony_ci uncharge_folio(folio, &ug); 728962306a36Sopenharmony_ci if (ug.memcg) 729062306a36Sopenharmony_ci uncharge_batch(&ug); 729162306a36Sopenharmony_ci} 729262306a36Sopenharmony_ci 729362306a36Sopenharmony_ci/** 729462306a36Sopenharmony_ci * mem_cgroup_migrate - Charge a folio's replacement. 729562306a36Sopenharmony_ci * @old: Currently circulating folio. 729662306a36Sopenharmony_ci * @new: Replacement folio. 729762306a36Sopenharmony_ci * 729862306a36Sopenharmony_ci * Charge @new as a replacement folio for @old. @old will 729962306a36Sopenharmony_ci * be uncharged upon free. 730062306a36Sopenharmony_ci * 730162306a36Sopenharmony_ci * Both folios must be locked, @new->mapping must be set up. 
730262306a36Sopenharmony_ci */ 730362306a36Sopenharmony_civoid mem_cgroup_migrate(struct folio *old, struct folio *new) 730462306a36Sopenharmony_ci{ 730562306a36Sopenharmony_ci struct mem_cgroup *memcg; 730662306a36Sopenharmony_ci long nr_pages = folio_nr_pages(new); 730762306a36Sopenharmony_ci unsigned long flags; 730862306a36Sopenharmony_ci 730962306a36Sopenharmony_ci VM_BUG_ON_FOLIO(!folio_test_locked(old), old); 731062306a36Sopenharmony_ci VM_BUG_ON_FOLIO(!folio_test_locked(new), new); 731162306a36Sopenharmony_ci VM_BUG_ON_FOLIO(folio_test_anon(old) != folio_test_anon(new), new); 731262306a36Sopenharmony_ci VM_BUG_ON_FOLIO(folio_nr_pages(old) != nr_pages, new); 731362306a36Sopenharmony_ci 731462306a36Sopenharmony_ci if (mem_cgroup_disabled()) 731562306a36Sopenharmony_ci return; 731662306a36Sopenharmony_ci 731762306a36Sopenharmony_ci /* Page cache replacement: new folio already charged? */ 731862306a36Sopenharmony_ci if (folio_memcg(new)) 731962306a36Sopenharmony_ci return; 732062306a36Sopenharmony_ci 732162306a36Sopenharmony_ci memcg = folio_memcg(old); 732262306a36Sopenharmony_ci VM_WARN_ON_ONCE_FOLIO(!memcg, old); 732362306a36Sopenharmony_ci if (!memcg) 732462306a36Sopenharmony_ci return; 732562306a36Sopenharmony_ci 732662306a36Sopenharmony_ci /* Force-charge the new page. 
The old one will be freed soon */ 732762306a36Sopenharmony_ci if (!mem_cgroup_is_root(memcg)) { 732862306a36Sopenharmony_ci page_counter_charge(&memcg->memory, nr_pages); 732962306a36Sopenharmony_ci if (do_memsw_account()) 733062306a36Sopenharmony_ci page_counter_charge(&memcg->memsw, nr_pages); 733162306a36Sopenharmony_ci } 733262306a36Sopenharmony_ci 733362306a36Sopenharmony_ci css_get(&memcg->css); 733462306a36Sopenharmony_ci commit_charge(new, memcg); 733562306a36Sopenharmony_ci 733662306a36Sopenharmony_ci local_irq_save(flags); 733762306a36Sopenharmony_ci mem_cgroup_charge_statistics(memcg, nr_pages); 733862306a36Sopenharmony_ci memcg_check_events(memcg, folio_nid(new)); 733962306a36Sopenharmony_ci local_irq_restore(flags); 734062306a36Sopenharmony_ci} 734162306a36Sopenharmony_ci 734262306a36Sopenharmony_ciDEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key); 734362306a36Sopenharmony_ciEXPORT_SYMBOL(memcg_sockets_enabled_key); 734462306a36Sopenharmony_ci 734562306a36Sopenharmony_civoid mem_cgroup_sk_alloc(struct sock *sk) 734662306a36Sopenharmony_ci{ 734762306a36Sopenharmony_ci struct mem_cgroup *memcg; 734862306a36Sopenharmony_ci 734962306a36Sopenharmony_ci if (!mem_cgroup_sockets_enabled) 735062306a36Sopenharmony_ci return; 735162306a36Sopenharmony_ci 735262306a36Sopenharmony_ci /* Do not associate the sock with unrelated interrupted task's memcg. 
*/ 735362306a36Sopenharmony_ci if (!in_task()) 735462306a36Sopenharmony_ci return; 735562306a36Sopenharmony_ci 735662306a36Sopenharmony_ci rcu_read_lock(); 735762306a36Sopenharmony_ci memcg = mem_cgroup_from_task(current); 735862306a36Sopenharmony_ci if (mem_cgroup_is_root(memcg)) 735962306a36Sopenharmony_ci goto out; 736062306a36Sopenharmony_ci if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active) 736162306a36Sopenharmony_ci goto out; 736262306a36Sopenharmony_ci if (css_tryget(&memcg->css)) 736362306a36Sopenharmony_ci sk->sk_memcg = memcg; 736462306a36Sopenharmony_ciout: 736562306a36Sopenharmony_ci rcu_read_unlock(); 736662306a36Sopenharmony_ci} 736762306a36Sopenharmony_ci 736862306a36Sopenharmony_civoid mem_cgroup_sk_free(struct sock *sk) 736962306a36Sopenharmony_ci{ 737062306a36Sopenharmony_ci if (sk->sk_memcg) 737162306a36Sopenharmony_ci css_put(&sk->sk_memcg->css); 737262306a36Sopenharmony_ci} 737362306a36Sopenharmony_ci 737462306a36Sopenharmony_ci/** 737562306a36Sopenharmony_ci * mem_cgroup_charge_skmem - charge socket memory 737662306a36Sopenharmony_ci * @memcg: memcg to charge 737762306a36Sopenharmony_ci * @nr_pages: number of pages to charge 737862306a36Sopenharmony_ci * @gfp_mask: reclaim mode 737962306a36Sopenharmony_ci * 738062306a36Sopenharmony_ci * Charges @nr_pages to @memcg. Returns %true if the charge fit within 738162306a36Sopenharmony_ci * @memcg's configured limit, %false if it doesn't. 
 */
bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
			     gfp_t gfp_mask)
{
	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
		/* cgroup1: account against the dedicated tcpmem counter. */
		struct page_counter *fail;

		if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
			memcg->tcpmem_pressure = 0;
			return true;
		}
		/* Over limit: signal pressure to the network stack. */
		memcg->tcpmem_pressure = 1;
		if (gfp_mask & __GFP_NOFAIL) {
			/* Caller cannot fail; force the charge over the limit. */
			page_counter_charge(&memcg->tcpmem, nr_pages);
			return true;
		}
		return false;
	}

	/* cgroup2: socket memory is part of the unified memory counter. */
	if (try_charge(memcg, gfp_mask, nr_pages) == 0) {
		mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
		return true;
	}

	return false;
}

/**
 * mem_cgroup_uncharge_skmem - uncharge socket memory
 * @memcg: memcg to uncharge
 * @nr_pages: number of pages to uncharge
 */
void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
{
	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
		page_counter_uncharge(&memcg->tcpmem, nr_pages);
		return;
	}

	mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages);

	/* Return the pages to the per-CPU charge cache. */
	refill_stock(memcg, nr_pages);
}

/*
 * Parse the "cgroup.memory=" boot command line option, a comma-separated
 * list of feature toggles (nosocket, nokmem, nobpf, kmem).
 */
static int __init cgroup_memory(char *s)
{
	char *token;

	while ((token = strsep(&s, ",")) != NULL) {
		if (!*token)
			continue;
		if (!strcmp(token, "nosocket"))
			cgroup_memory_nosocket = true;
		if (!strcmp(token, "nokmem"))
			cgroup_memory_nokmem = true;
		if (!strcmp(token, "nobpf"))
			cgroup_memory_nobpf = true;
		if (!strcmp(token, "kmem"))
			cgroup_memory_nokmem = false;
	}
	return 1;
}
__setup("cgroup.memory=", cgroup_memory);

/*
 * subsys_initcall() for memory controller.
 *
 * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this
 * context because of lock dependencies (cgroup_lock -> cpu hotplug) but
 * basically everything that doesn't depend on a specific mem_cgroup structure
 * should be initialized from here.
 */
static int __init mem_cgroup_init(void)
{
	int cpu, node;

	/*
	 * Currently s32 type (can refer to struct batched_lruvec_stat) is
	 * used for per-memcg-per-cpu caching of per-node statistics. In order
	 * to work fine, we should make sure that the overfill threshold can't
	 * exceed S32_MAX / PAGE_SIZE.
	 */
	BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S32_MAX / PAGE_SIZE);

	cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
				  memcg_hotplug_cpu_dead);

	for_each_possible_cpu(cpu)
		INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
			  drain_local_stock);

	/* One soft-limit rb-tree root per node. */
	for_each_node(node) {
		struct mem_cgroup_tree_per_node *rtpn;

		/*
		 * NOTE(review): the kzalloc_node() result is not checked;
		 * this boot-time allocation is presumably assumed to
		 * succeed — confirm against upstream.
		 */
		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, node);

		rtpn->rb_root = RB_ROOT;
		rtpn->rb_rightmost = NULL;
		spin_lock_init(&rtpn->lock);
		soft_limit_tree.rb_tree_per_node[node] = rtpn;
	}

	return 0;
}
subsys_initcall(mem_cgroup_init);

#ifdef CONFIG_SWAP
/*
 * Pin @memcg's id, or, if it is already gone, walk up the hierarchy to
 * the closest ancestor whose id can still be pinned.
 */
static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
{
	while (!refcount_inc_not_zero(&memcg->id.ref)) {
		/*
		 * The root cgroup cannot be destroyed, so its refcount must
		 * always be >= 1.
		 */
		if (WARN_ON_ONCE(mem_cgroup_is_root(memcg))) {
			VM_BUG_ON(1);
			break;
		}
		memcg = parent_mem_cgroup(memcg);
		if (!memcg)
			memcg = root_mem_cgroup;
	}
	return memcg;
}

/**
 * mem_cgroup_swapout - transfer a memsw charge to swap
 * @folio: folio whose memsw charge to transfer
 * @entry: swap entry to move the charge to
 *
 * Transfer the memsw charge of @folio to @entry.
 */
void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
{
	struct mem_cgroup *memcg, *swap_memcg;
	unsigned int nr_entries;
	unsigned short oldid;

	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
	VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);

	if (mem_cgroup_disabled())
		return;

	/*
	 * Only for memsw (cgroup1) accounting; otherwise swap is charged
	 * via __mem_cgroup_try_charge_swap() instead.
	 */
	if (!do_memsw_account())
		return;

	memcg = folio_memcg(folio);

	VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
	if (!memcg)
		return;

	/*
	 * In case the memcg owning these pages has been offlined and doesn't
	 * have an ID allocated to it anymore, charge the closest online
	 * ancestor for the swap instead and transfer the memory+swap charge.
	 */
	swap_memcg = mem_cgroup_id_get_online(memcg);
	nr_entries = folio_nr_pages(folio);
	/* Get references for the tail pages, too */
	if (nr_entries > 1)
		mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
	oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
				   nr_entries);
	VM_BUG_ON_FOLIO(oldid, folio);
	mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);

	/* Sever the folio's memcg binding; the charge moves to swap. */
	folio->memcg_data = 0;

	if (!mem_cgroup_is_root(memcg))
		page_counter_uncharge(&memcg->memory, nr_entries);

	if (memcg != swap_memcg) {
		/* Move the memsw charge over to the online ancestor. */
		if (!mem_cgroup_is_root(swap_memcg))
			page_counter_charge(&swap_memcg->memsw, nr_entries);
		page_counter_uncharge(&memcg->memsw, nr_entries);
	}

	/*
	 * Interrupts should be disabled here because the caller holds the
	 * i_pages lock which is taken with interrupts-off. It is
	 * important here to have the interrupts disabled because it is the
	 * only synchronisation we have for updating the per-CPU variables.
 */
	memcg_stats_lock();
	mem_cgroup_charge_statistics(memcg, -nr_entries);
	memcg_stats_unlock();
	memcg_check_events(memcg, folio_nid(folio));

	/* Drop the reference the folio's memcg binding used to hold. */
	css_put(&memcg->css);
}

/**
 * __mem_cgroup_try_charge_swap - try charging swap space for a folio
 * @folio: folio being added to swap
 * @entry: swap entry to charge
 *
 * Try to charge @folio's memcg for the swap space at @entry.
 *
 * Returns 0 on success, -ENOMEM on failure.
 */
int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
{
	unsigned int nr_pages = folio_nr_pages(folio);
	struct page_counter *counter;
	struct mem_cgroup *memcg;
	unsigned short oldid;

	/* Under memsw (cgroup1) accounting, mem_cgroup_swapout() handles swap. */
	if (do_memsw_account())
		return 0;

	memcg = folio_memcg(folio);

	VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
	if (!memcg)
		return 0;

	if (!entry.val) {
		memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
		return 0;
	}

	/* Pin the id of the memcg (or its closest online ancestor). */
	memcg = mem_cgroup_id_get_online(memcg);

	if (!mem_cgroup_is_root(memcg) &&
	    !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
		memcg_memory_event(memcg, MEMCG_SWAP_MAX);
		memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
		mem_cgroup_id_put(memcg);
		return -ENOMEM;
	}

	/* Get references for the tail pages, too */
	if (nr_pages > 1)
		mem_cgroup_id_get_many(memcg, nr_pages - 1);
	oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
	VM_BUG_ON_FOLIO(oldid, folio);
	mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);

	return 0;
}

/**
 * __mem_cgroup_uncharge_swap - uncharge swap space
 * @entry: swap entry to uncharge
 * @nr_pages: the amount of swap space to uncharge
 */
void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
{
	struct mem_cgroup *memcg;
	unsigned short id;

	/* Clear the entry's memcg id and look up its owner by that id. */
	id = swap_cgroup_record(entry, 0, nr_pages);
	rcu_read_lock();
	memcg = mem_cgroup_from_id(id);
	if (memcg) {
		if (!mem_cgroup_is_root(memcg)) {
			if (do_memsw_account())
				page_counter_uncharge(&memcg->memsw, nr_pages);
			else
				page_counter_uncharge(&memcg->swap, nr_pages);
		}
		mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages);
		/* Release the id references taken at charge time. */
		mem_cgroup_id_put_many(memcg, nr_pages);
	}
	rcu_read_unlock();
}

/*
 * Return the number of swap pages currently available to @memcg: the
 * global count, capped by the tightest swap limit on the path to root.
 */
long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
{
	long nr_swap_pages = get_nr_swap_pages();

	if (mem_cgroup_disabled() || do_memsw_account())
		return nr_swap_pages;
	for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg))
		nr_swap_pages = min_t(long, nr_swap_pages,
				      READ_ONCE(memcg->swap.max) -
				      page_counter_read(&memcg->swap));
	return nr_swap_pages;
}

/*
 * Is swap at least half full — either globally, or against the swap
 * high/max limit of any memcg in @folio's hierarchy?
 */
bool mem_cgroup_swap_full(struct folio *folio)
{
	struct mem_cgroup *memcg;

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

	if (vm_swap_full())
		return true;
	if (do_memsw_account())
		return false;

	memcg = folio_memcg(folio);
	if (!memcg)
		return false;

	for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
		unsigned long usage = page_counter_read(&memcg->swap);

		/* "Half full" against either the high or the max limit. */
		if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
		    usage * 2 >= READ_ONCE(memcg->swap.max))
			return true;
	}

	return false;
}

/* Warn about the deprecated "swapaccount=0" boot option. */
static int __init setup_swap_account(char *s)
{
	bool res;

	if (!kstrtobool(s, &res) && !res)
		pr_warn_once("The swapaccount=0 commandline option is deprecated "
			     "in favor of configuring swap control via cgroupfs. "
			     "Please report your usecase to linux-mm@kvack.org if you "
			     "depend on this functionality.\n");
	return 1;
}
__setup("swapaccount=", setup_swap_account);

/* memory.swap.current: bytes of swap currently charged to the memcg. */
static u64 swap_current_read(struct cgroup_subsys_state *css,
			     struct cftype *cft)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
}

/* memory.swap.peak: historical maximum of swap usage, in bytes. */
static u64 swap_peak_read(struct cgroup_subsys_state *css,
			  struct cftype *cft)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	return (u64)memcg->swap.watermark * PAGE_SIZE;
}

static int swap_high_show(struct seq_file *m, void *v)
{
	return seq_puts_memcg_tunable(m,
		READ_ONCE(mem_cgroup_from_seq(m)->swap.high));
}

/* Write handler for memory.swap.high; accepts "max" or a byte count. */
static ssize_t swap_high_write(struct kernfs_open_file *of,
			       char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long high;
	int err;

	buf = strstrip(buf);
	err = page_counter_memparse(buf, "max", &high);
	if (err)
		return err;

	page_counter_set_high(&memcg->swap, high);

	return nbytes;
}

static int swap_max_show(struct seq_file *m, void *v)
{
	return seq_puts_memcg_tunable(m,
		READ_ONCE(mem_cgroup_from_seq(m)->swap.max));
}

/* Write handler for memory.swap.max; accepts "max" or a byte count. */
static ssize_t swap_max_write(struct kernfs_open_file *of,
			      char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long max;
	int err;

	buf = strstrip(buf);
	err = page_counter_memparse(buf, "max", &max);
	if (err)
		return err;

	xchg(&memcg->swap.max, max);

	return nbytes;
}

/* memory.swap.events: dump the high/max/fail event counters. */
static int swap_events_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);

	seq_printf(m, "high %lu\n",
		   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH]));
	seq_printf(m, "max %lu\n",
		   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
	seq_printf(m, "fail %lu\n",
		   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL]));

	return 0;
}

/* Swap control files for the default (cgroup2) hierarchy. */
static struct cftype swap_files[] = {
	{
		.name = "swap.current",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_u64 = swap_current_read,
	},
	{
		.name = "swap.high",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = swap_high_show,
		.write = swap_high_write,
	},
	{
		.name = "swap.max",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = swap_max_show,
		.write = swap_max_write,
	},
	{
		.name = "swap.peak",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_u64 = swap_peak_read,
	},
	{
		.name = "swap.events",
		.flags = CFTYPE_NOT_ON_ROOT,
		.file_offset = offsetof(struct mem_cgroup, swap_events_file),
		.seq_show = swap_events_show,
	},
	{ }	/* terminate */
};

/* memory+swap control files for the legacy (cgroup1) hierarchy. */
static struct cftype memsw_files[] = {
	{
		.name = "memsw.usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "memsw.max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "memsw.limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
		.write = mem_cgroup_write,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "memsw.failcnt",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{ },	/* terminate */
};

#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
/**
 * obj_cgroup_may_zswap - check if this cgroup can zswap
 * @objcg: the object cgroup
 *
 * Check if the hierarchical zswap limit has been reached.
 *
 * This doesn't check for specific headroom, and it is not atomic
 * either. But with zswap, the size of the allocation is only known
 * once compression has occurred, and this optimistic pre-check avoids
 * spending cycles on compression when there is already no room left
 * or zswap is disabled altogether somewhere in the hierarchy.
 */
bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
{
	struct mem_cgroup *memcg, *original_memcg;
	bool ret = true;

	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return true;

	original_memcg = get_mem_cgroup_from_objcg(objcg);
	for (memcg = original_memcg; !mem_cgroup_is_root(memcg);
	     memcg = parent_mem_cgroup(memcg)) {
		unsigned long max = READ_ONCE(memcg->zswap_max);
		unsigned long pages;

		/* No limit at this level; check the next ancestor. */
		if (max == PAGE_COUNTER_MAX)
			continue;
		/* Zero limit disables zswap for the whole subtree. */
		if (max == 0) {
			ret = false;
			break;
		}

		/* Flush stats so usage is current before comparing. */
		cgroup_rstat_flush(memcg->css.cgroup);
		pages = memcg_page_state(memcg, MEMCG_ZSWAP_B) / PAGE_SIZE;
		if (pages < max)
			continue;
		ret = false;
		break;
	}
	mem_cgroup_put(original_memcg);
	return ret;
}

/**
 * obj_cgroup_charge_zswap - charge compression backend memory
 * @objcg: the object cgroup
 * @size: size of compressed object
 *
 * This forces the charge after obj_cgroup_may_zswap() allowed
 * compression and storage in zswap for this cgroup to go ahead.
 */
void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size)
{
	struct mem_cgroup *memcg;

	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return;

	VM_WARN_ON_ONCE(!(current->flags & PF_MEMALLOC));

	/* PF_MEMALLOC context, charging must succeed */
	if (obj_cgroup_charge(objcg, GFP_KERNEL, size))
		VM_WARN_ON_ONCE(1);

	rcu_read_lock();
	memcg = obj_cgroup_memcg(objcg);
	mod_memcg_state(memcg, MEMCG_ZSWAP_B, size);
	mod_memcg_state(memcg, MEMCG_ZSWAPPED, 1);
	rcu_read_unlock();
}

/**
 * obj_cgroup_uncharge_zswap - uncharge compression backend memory
 * @objcg: the object cgroup
 * @size: size of compressed object
 *
 * Uncharges zswap memory on page in.
 */
void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size)
{
	struct mem_cgroup *memcg;

	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return;

	obj_cgroup_uncharge(objcg, size);

	rcu_read_lock();
	memcg = obj_cgroup_memcg(objcg);
	mod_memcg_state(memcg, MEMCG_ZSWAP_B, -size);
	mod_memcg_state(memcg, MEMCG_ZSWAPPED, -1);
	rcu_read_unlock();
}

/* memory.zswap.current: bytes of zswap backend memory in use. */
static u64 zswap_current_read(struct cgroup_subsys_state *css,
			      struct cftype *cft)
{
	/* Flush stats so the reported value is current. */
	cgroup_rstat_flush(css->cgroup);
	return memcg_page_state(mem_cgroup_from_css(css), MEMCG_ZSWAP_B);
}

static int zswap_max_show(struct seq_file *m, void *v)
{
	return seq_puts_memcg_tunable(m,
		READ_ONCE(mem_cgroup_from_seq(m)->zswap_max));
}

/* Write handler for memory.zswap.max; accepts "max" or a byte count. */
static ssize_t zswap_max_write(struct kernfs_open_file *of,
			       char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long max;
	int err;

	buf = strstrip(buf);
	err = page_counter_memparse(buf, "max", &max);
	if (err)
		return err;

	xchg(&memcg->zswap_max, max);

	return nbytes;
}

/* zswap control files for the default (cgroup2) hierarchy. */
static struct cftype zswap_files[] = {
	{
		.name = "zswap.current",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_u64 = zswap_current_read,
	},
	{
		.name = "zswap.max",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = zswap_max_show,
		.write = zswap_max_write,
	},
	{ }	/* terminate */
};
#endif /* CONFIG_MEMCG_KMEM && CONFIG_ZSWAP */

/* Register the swap, memsw and zswap control files with cgroupfs. */
static int __init mem_cgroup_swap_init(void)
{
	if (mem_cgroup_disabled())
		return 0;

	WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files));
	WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files));
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
	WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, zswap_files));
#endif
	return 0;
}
subsys_initcall(mem_cgroup_swap_init);

#endif /*
CONFIG_SWAP */ 7995