1// SPDX-License-Identifier: GPL-2.0-or-later 2/* memcontrol.c - Memory Controller 3 * 4 * Copyright IBM Corporation, 2007 5 * Author Balbir Singh <balbir@linux.vnet.ibm.com> 6 * 7 * Copyright 2007 OpenVZ SWsoft Inc 8 * Author: Pavel Emelianov <xemul@openvz.org> 9 * 10 * Memory thresholds 11 * Copyright (C) 2009 Nokia Corporation 12 * Author: Kirill A. Shutemov 13 * 14 * Kernel Memory Controller 15 * Copyright (C) 2012 Parallels Inc. and Google Inc. 16 * Authors: Glauber Costa and Suleiman Souhlal 17 * 18 * Native page reclaim 19 * Charge lifetime sanitation 20 * Lockless page tracking & accounting 21 * Unified hierarchy configuration model 22 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner 23 */ 24 25#include <linux/page_counter.h> 26#include <linux/memcontrol.h> 27#include <linux/cgroup.h> 28#include <linux/pagewalk.h> 29#include <linux/sched/mm.h> 30#include <linux/shmem_fs.h> 31#include <linux/hugetlb.h> 32#include <linux/pagemap.h> 33#include <linux/vm_event_item.h> 34#include <linux/smp.h> 35#include <linux/page-flags.h> 36#include <linux/backing-dev.h> 37#include <linux/bit_spinlock.h> 38#include <linux/rcupdate.h> 39#include <linux/limits.h> 40#include <linux/export.h> 41#include <linux/mutex.h> 42#include <linux/rbtree.h> 43#include <linux/slab.h> 44#include <linux/swap.h> 45#include <linux/swapops.h> 46#include <linux/spinlock.h> 47#include <linux/eventfd.h> 48#include <linux/poll.h> 49#include <linux/sort.h> 50#include <linux/fs.h> 51#include <linux/seq_file.h> 52#include <linux/vmpressure.h> 53#include <linux/mm_inline.h> 54#include <linux/swap_cgroup.h> 55#include <linux/cpu.h> 56#include <linux/oom.h> 57#include <linux/lockdep.h> 58#include <linux/file.h> 59#include <linux/tracehook.h> 60#include <linux/psi.h> 61#include <linux/seq_buf.h> 62#include "internal.h" 63#include <net/sock.h> 64#include <net/ip.h> 65#include "slab.h" 66 67#include <linux/uaccess.h> 68#include <linux/zswapd.h> 69 70#include <trace/events/vmscan.h> 71 72struct cgroup_subsys memory_cgrp_subsys __read_mostly; 73EXPORT_SYMBOL(memory_cgrp_subsys); 74 75struct mem_cgroup *root_mem_cgroup __read_mostly; 76 77/* Active memory cgroup to use from an interrupt context */ 78DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg); 79 80/* Socket memory accounting disabled? 
*/ 81static bool cgroup_memory_nosocket; 82 83/* Kernel memory accounting disabled */ 84static bool cgroup_memory_nokmem = true; 85 86/* Whether the swap controller is active */ 87#ifdef CONFIG_MEMCG_SWAP 88bool cgroup_memory_noswap __read_mostly; 89#else 90#define cgroup_memory_noswap 1 91#endif 92 93#ifdef CONFIG_CGROUP_WRITEBACK 94static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); 95#endif 96 97/* Whether legacy memory+swap accounting is active */ 98static bool do_memsw_account(void) 99{ 100 return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap; 101} 102 103#define THRESHOLDS_EVENTS_TARGET 128 104#define SOFTLIMIT_EVENTS_TARGET 1024 105 106/* 107 * Cgroups above their limits are maintained in a RB-Tree, independent of 108 * their hierarchy representation 109 */ 110 111struct mem_cgroup_tree_per_node { 112 struct rb_root rb_root; 113 struct rb_node *rb_rightmost; 114 spinlock_t lock; 115}; 116 117struct mem_cgroup_tree { 118 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; 119}; 120 121static struct mem_cgroup_tree soft_limit_tree __read_mostly; 122 123/* for OOM */ 124struct mem_cgroup_eventfd_list { 125 struct list_head list; 126 struct eventfd_ctx *eventfd; 127}; 128 129/* 130 * cgroup_event represents events which userspace want to receive. 131 */ 132struct mem_cgroup_event { 133 /* 134 * memcg which the event belongs to. 135 */ 136 struct mem_cgroup *memcg; 137 /* 138 * eventfd to signal userspace about the event. 139 */ 140 struct eventfd_ctx *eventfd; 141 /* 142 * Each of these stored in a list by the cgroup. 143 */ 144 struct list_head list; 145 /* 146 * register_event() callback will be used to add new userspace 147 * waiter for changes related to this event. Use eventfd_signal() 148 * on eventfd to send notification to userspace. 149 */ 150 int (*register_event)(struct mem_cgroup *memcg, 151 struct eventfd_ctx *eventfd, const char *args); 152 /* 153 * unregister_event() callback will be called when userspace closes 154 * the eventfd or on cgroup removing. This callback must be set, 155 * if you want provide notification functionality. 156 */ 157 void (*unregister_event)(struct mem_cgroup *memcg, 158 struct eventfd_ctx *eventfd); 159 /* 160 * All fields below needed to unregister event when 161 * userspace closes eventfd. 162 */ 163 poll_table pt; 164 wait_queue_head_t *wqh; 165 wait_queue_entry_t wait; 166 struct work_struct remove; 167}; 168 169static void mem_cgroup_threshold(struct mem_cgroup *memcg); 170static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); 171 172/* Stuffs for move charges at task migration. */ 173/* 174 * Types of charges to be moved. 175 */ 176#define MOVE_ANON 0x1U 177#define MOVE_FILE 0x2U 178#define MOVE_MASK (MOVE_ANON | MOVE_FILE) 179 180/* "mc" and its members are protected by cgroup_mutex */ 181static struct move_charge_struct { 182 spinlock_t lock; /* for from, to */ 183 struct mm_struct *mm; 184 struct mem_cgroup *from; 185 struct mem_cgroup *to; 186 unsigned long flags; 187 unsigned long precharge; 188 unsigned long moved_charge; 189 unsigned long moved_swap; 190 struct task_struct *moving_task; /* a task moving charges */ 191 wait_queue_head_t waitq; /* a waitq for other context */ 192} mc = { 193 .lock = __SPIN_LOCK_UNLOCKED(mc.lock), 194 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), 195}; 196 197/* 198 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft 199 * limit reclaim to prevent infinite loops, if they ever occur. 
 */
#define MEM_CGROUP_MAX_RECLAIM_LOOPS		100
#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2

/* for encoding cft->private value on file */
enum res_type {
	_MEM,
	_MEMSWAP,
	_OOM_TYPE,
	_KMEM,
	_TCP,
};

#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)
/* Used for OOM notifier */
#define OOM_CONTROL		(0)

/*
 * Iteration constructs for visiting all cgroups (under a tree). If
 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 * be used for reference counting.
 */
#define for_each_mem_cgroup_tree(iter, root)		\
	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(root, iter, NULL))

#define for_each_mem_cgroup(iter)			\
	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(NULL, iter, NULL))

static inline bool task_is_dying(void)
{
	return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
		(current->flags & PF_EXITING);
}

/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
	if (!memcg)
		memcg = root_mem_cgroup;
	return &memcg->vmpressure;
}

struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
{
	return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
}

#ifdef CONFIG_MEMCG_KMEM
static DEFINE_SPINLOCK(objcg_lock);

static void obj_cgroup_release(struct percpu_ref *ref)
{
	struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
	struct mem_cgroup *memcg;
	unsigned int nr_bytes;
	unsigned int nr_pages;
	unsigned long flags;

	/*
	 * At this point all allocated objects are freed, and
	 * objcg->nr_charged_bytes can't have an arbitrary byte value.
	 * However, it can be PAGE_SIZE or (x * PAGE_SIZE).
	 *
	 * The following sequence can lead to it:
	 * 1) CPU0: objcg == stock->cached_objcg
	 * 2) CPU1: we do a small allocation (e.g. 92 bytes),
	 *          PAGE_SIZE bytes are charged
	 * 3) CPU1: a process from another memcg is allocating something,
	 *          the stock is flushed,
	 *          objcg->nr_charged_bytes = PAGE_SIZE - 92
	 * 4) CPU0: we do release this object,
	 *          92 bytes are added to stock->nr_bytes
	 * 5) CPU0: stock is flushed,
	 *          92 bytes are added to objcg->nr_charged_bytes
	 *
	 * As a result, nr_charged_bytes == PAGE_SIZE.
	 * This page will be uncharged in obj_cgroup_release().
283 */ 284 nr_bytes = atomic_read(&objcg->nr_charged_bytes); 285 WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1)); 286 nr_pages = nr_bytes >> PAGE_SHIFT; 287 288 spin_lock_irqsave(&objcg_lock, flags); 289 memcg = obj_cgroup_memcg(objcg); 290 if (nr_pages) 291 __memcg_kmem_uncharge(memcg, nr_pages); 292 list_del(&objcg->list); 293 mem_cgroup_put(memcg); 294 spin_unlock_irqrestore(&objcg_lock, flags); 295 296 percpu_ref_exit(ref); 297 kfree_rcu(objcg, rcu); 298} 299 300static struct obj_cgroup *obj_cgroup_alloc(void) 301{ 302 struct obj_cgroup *objcg; 303 int ret; 304 305 objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL); 306 if (!objcg) 307 return NULL; 308 309 ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0, 310 GFP_KERNEL); 311 if (ret) { 312 kfree(objcg); 313 return NULL; 314 } 315 INIT_LIST_HEAD(&objcg->list); 316 return objcg; 317} 318 319static void memcg_reparent_objcgs(struct mem_cgroup *memcg, 320 struct mem_cgroup *parent) 321{ 322 struct obj_cgroup *objcg, *iter; 323 324 objcg = rcu_replace_pointer(memcg->objcg, NULL, true); 325 326 spin_lock_irq(&objcg_lock); 327 328 /* Move active objcg to the parent's list */ 329 xchg(&objcg->memcg, parent); 330 css_get(&parent->css); 331 list_add(&objcg->list, &parent->objcg_list); 332 333 /* Move already reparented objcgs to the parent's list */ 334 list_for_each_entry(iter, &memcg->objcg_list, list) { 335 css_get(&parent->css); 336 xchg(&iter->memcg, parent); 337 css_put(&memcg->css); 338 } 339 list_splice(&memcg->objcg_list, &parent->objcg_list); 340 341 spin_unlock_irq(&objcg_lock); 342 343 percpu_ref_kill(&objcg->refcnt); 344} 345 346/* 347 * This will be used as a shrinker list's index. 348 * The main reason for not using cgroup id for this: 349 * this works better in sparse environments, where we have a lot of memcgs, 350 * but only a few kmem-limited. Or also, if we have, for instance, 200 351 * memcgs, and none but the 200th is kmem-limited, we'd have to have a 352 * 200 entry array for that. 353 * 354 * The current size of the caches array is stored in memcg_nr_cache_ids. It 355 * will double each time we have to increase it. 356 */ 357static DEFINE_IDA(memcg_cache_ida); 358int memcg_nr_cache_ids; 359 360/* Protects memcg_nr_cache_ids */ 361static DECLARE_RWSEM(memcg_cache_ids_sem); 362 363void memcg_get_cache_ids(void) 364{ 365 down_read(&memcg_cache_ids_sem); 366} 367 368void memcg_put_cache_ids(void) 369{ 370 up_read(&memcg_cache_ids_sem); 371} 372 373/* 374 * MIN_SIZE is different than 1, because we would like to avoid going through 375 * the alloc/free process all the time. In a small machine, 4 kmem-limited 376 * cgroups is a reasonable guess. In the future, it could be a parameter or 377 * tunable, but that is strictly not necessary. 378 * 379 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get 380 * this constant directly from cgroup, but it is understandable that this is 381 * better kept as an internal representation in cgroup.c. In any case, the 382 * cgrp_id space is not getting any smaller, and we don't have to necessarily 383 * increase ours as well if it increases. 384 */ 385#define MEMCG_CACHES_MIN_SIZE 4 386#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX 387 388/* 389 * A lot of the calls to the cache allocation functions are expected to be 390 * inlined by the compiler. 
Since the calls to memcg_slab_pre_alloc_hook() are 391 * conditional to this static branch, we'll have to allow modules that does 392 * kmem_cache_alloc and the such to see this symbol as well 393 */ 394DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key); 395EXPORT_SYMBOL(memcg_kmem_enabled_key); 396#endif 397 398static int memcg_shrinker_map_size; 399static DEFINE_MUTEX(memcg_shrinker_map_mutex); 400 401static void memcg_free_shrinker_map_rcu(struct rcu_head *head) 402{ 403 kvfree(container_of(head, struct memcg_shrinker_map, rcu)); 404} 405 406static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg, 407 int size, int old_size) 408{ 409 struct memcg_shrinker_map *new, *old; 410 int nid; 411 412 lockdep_assert_held(&memcg_shrinker_map_mutex); 413 414 for_each_node(nid) { 415 old = rcu_dereference_protected( 416 mem_cgroup_nodeinfo(memcg, nid)->shrinker_map, true); 417 /* Not yet online memcg */ 418 if (!old) 419 return 0; 420 421 new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid); 422 if (!new) 423 return -ENOMEM; 424 425 /* Set all old bits, clear all new bits */ 426 memset(new->map, (int)0xff, old_size); 427 memset((void *)new->map + old_size, 0, size - old_size); 428 429 rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new); 430 call_rcu(&old->rcu, memcg_free_shrinker_map_rcu); 431 } 432 433 return 0; 434} 435 436static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) 437{ 438 struct mem_cgroup_per_node *pn; 439 struct memcg_shrinker_map *map; 440 int nid; 441 442 if (mem_cgroup_is_root(memcg)) 443 return; 444 445 for_each_node(nid) { 446 pn = mem_cgroup_nodeinfo(memcg, nid); 447 map = rcu_dereference_protected(pn->shrinker_map, true); 448 if (map) 449 kvfree(map); 450 rcu_assign_pointer(pn->shrinker_map, NULL); 451 } 452} 453 454static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg) 455{ 456 struct memcg_shrinker_map *map; 457 int nid, size, ret = 0; 458 459 if (mem_cgroup_is_root(memcg)) 460 return 0; 461 462 mutex_lock(&memcg_shrinker_map_mutex); 463 size = memcg_shrinker_map_size; 464 for_each_node(nid) { 465 map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid); 466 if (!map) { 467 memcg_free_shrinker_maps(memcg); 468 ret = -ENOMEM; 469 break; 470 } 471 rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map); 472 } 473 mutex_unlock(&memcg_shrinker_map_mutex); 474 475 return ret; 476} 477 478int memcg_expand_shrinker_maps(int new_id) 479{ 480 int size, old_size, ret = 0; 481 struct mem_cgroup *memcg; 482 483 size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long); 484 old_size = memcg_shrinker_map_size; 485 if (size <= old_size) 486 return 0; 487 488 mutex_lock(&memcg_shrinker_map_mutex); 489 if (!root_mem_cgroup) 490 goto unlock; 491 492 for_each_mem_cgroup(memcg) { 493 if (mem_cgroup_is_root(memcg)) 494 continue; 495 ret = memcg_expand_one_shrinker_map(memcg, size, old_size); 496 if (ret) { 497 mem_cgroup_iter_break(NULL, memcg); 498 goto unlock; 499 } 500 } 501unlock: 502 if (!ret) 503 memcg_shrinker_map_size = size; 504 mutex_unlock(&memcg_shrinker_map_mutex); 505 return ret; 506} 507 508void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) 509{ 510 if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { 511 struct memcg_shrinker_map *map; 512 513 rcu_read_lock(); 514 map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map); 515 /* Pairs with smp mb in shrink_slab() */ 516 smp_mb__before_atomic(); 517 set_bit(shrinker_id, map->map); 518 rcu_read_unlock(); 519 } 520} 521 522/** 523 * 
mem_cgroup_css_from_page - css of the memcg associated with a page 524 * @page: page of interest 525 * 526 * If memcg is bound to the default hierarchy, css of the memcg associated 527 * with @page is returned. The returned css remains associated with @page 528 * until it is released. 529 * 530 * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup 531 * is returned. 532 */ 533struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page) 534{ 535 struct mem_cgroup *memcg; 536 537 memcg = page->mem_cgroup; 538 539 if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) 540 memcg = root_mem_cgroup; 541 542 return &memcg->css; 543} 544 545/** 546 * page_cgroup_ino - return inode number of the memcg a page is charged to 547 * @page: the page 548 * 549 * Look up the closest online ancestor of the memory cgroup @page is charged to 550 * and return its inode number or 0 if @page is not charged to any cgroup. It 551 * is safe to call this function without holding a reference to @page. 552 * 553 * Note, this function is inherently racy, because there is nothing to prevent 554 * the cgroup inode from getting torn down and potentially reallocated a moment 555 * after page_cgroup_ino() returns, so it only should be used by callers that 556 * do not care (such as procfs interfaces). 557 */ 558ino_t page_cgroup_ino(struct page *page) 559{ 560 struct mem_cgroup *memcg; 561 unsigned long ino = 0; 562 563 rcu_read_lock(); 564 memcg = page->mem_cgroup; 565 566 /* 567 * The lowest bit set means that memcg isn't a valid 568 * memcg pointer, but a obj_cgroups pointer. 569 * In this case the page is shared and doesn't belong 570 * to any specific memory cgroup. 571 */ 572 if ((unsigned long) memcg & 0x1UL) 573 memcg = NULL; 574 575 while (memcg && !(memcg->css.flags & CSS_ONLINE)) 576 memcg = parent_mem_cgroup(memcg); 577 if (memcg) 578 ino = cgroup_ino(memcg->css.cgroup); 579 rcu_read_unlock(); 580 return ino; 581} 582 583static struct mem_cgroup_per_node * 584mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page) 585{ 586 int nid = page_to_nid(page); 587 588 return memcg->nodeinfo[nid]; 589} 590 591static struct mem_cgroup_tree_per_node * 592soft_limit_tree_node(int nid) 593{ 594 return soft_limit_tree.rb_tree_per_node[nid]; 595} 596 597static struct mem_cgroup_tree_per_node * 598soft_limit_tree_from_page(struct page *page) 599{ 600 int nid = page_to_nid(page); 601 602 return soft_limit_tree.rb_tree_per_node[nid]; 603} 604 605static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz, 606 struct mem_cgroup_tree_per_node *mctz, 607 unsigned long new_usage_in_excess) 608{ 609 struct rb_node **p = &mctz->rb_root.rb_node; 610 struct rb_node *parent = NULL; 611 struct mem_cgroup_per_node *mz_node; 612 bool rightmost = true; 613 614 if (mz->on_tree) 615 return; 616 617 mz->usage_in_excess = new_usage_in_excess; 618 if (!mz->usage_in_excess) 619 return; 620 while (*p) { 621 parent = *p; 622 mz_node = rb_entry(parent, struct mem_cgroup_per_node, 623 tree_node); 624 if (mz->usage_in_excess < mz_node->usage_in_excess) { 625 p = &(*p)->rb_left; 626 rightmost = false; 627 } 628 629 /* 630 * We can't avoid mem cgroups that are over their soft 631 * limit by the same amount 632 */ 633 else if (mz->usage_in_excess >= mz_node->usage_in_excess) 634 p = &(*p)->rb_right; 635 } 636 637 if (rightmost) 638 mctz->rb_rightmost = &mz->tree_node; 639 640 rb_link_node(&mz->tree_node, parent, p); 641 rb_insert_color(&mz->tree_node, &mctz->rb_root); 642 mz->on_tree = true; 643} 
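
/*
 * Usage sketch for the soft-limit tree helpers above and below (illustrative
 * only; it mirrors what mem_cgroup_update_tree() below already does, and
 * assumes the caller has looked up @mz and @mctz, e.g. via
 * mem_cgroup_page_nodeinfo() and soft_limit_tree_from_page()):
 *
 *	spin_lock_irqsave(&mctz->lock, flags);
 *	if (mz->on_tree)
 *		__mem_cgroup_remove_exceeded(mz, mctz);
 *	__mem_cgroup_insert_exceeded(mz, mctz, soft_limit_excess(memcg));
 *	spin_unlock_irqrestore(&mctz->lock, flags);
 *
 * Both helpers expect mctz->lock to be held by the caller.
 */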

static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
					 struct mem_cgroup_tree_per_node *mctz)
{
	if (!mz->on_tree)
		return;

	if (&mz->tree_node == mctz->rb_rightmost)
		mctz->rb_rightmost = rb_prev(&mz->tree_node);

	rb_erase(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = false;
}

static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
				       struct mem_cgroup_tree_per_node *mctz)
{
	unsigned long flags;

	spin_lock_irqsave(&mctz->lock, flags);
	__mem_cgroup_remove_exceeded(mz, mctz);
	spin_unlock_irqrestore(&mctz->lock, flags);
}

static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
{
#ifdef CONFIG_HYPERHOLD_FILE_LRU
	struct mem_cgroup_per_node *mz = mem_cgroup_nodeinfo(memcg, 0);
	struct lruvec *lruvec = &mz->lruvec;
	unsigned long nr_pages = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON,
			MAX_NR_ZONES) + lruvec_lru_size(lruvec, LRU_INACTIVE_ANON,
			MAX_NR_ZONES);
#else
	unsigned long nr_pages = page_counter_read(&memcg->memory);
#endif
	unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
	unsigned long excess = 0;

	if (nr_pages > soft_limit)
		excess = nr_pages - soft_limit;

	return excess;
}

static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
{
	unsigned long excess;
	struct mem_cgroup_per_node *mz;
	struct mem_cgroup_tree_per_node *mctz;

	mctz = soft_limit_tree_from_page(page);
	if (!mctz)
		return;
	/*
	 * Necessary to update all ancestors when hierarchy is used,
	 * because their event counter is not touched.
	 */
	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
		mz = mem_cgroup_page_nodeinfo(memcg, page);
		excess = soft_limit_excess(memcg);
		/*
		 * We have to update the tree if mz is on the RB-tree or
		 * memcg is over its soft limit.
		 */
		if (excess || mz->on_tree) {
			unsigned long flags;

			spin_lock_irqsave(&mctz->lock, flags);
			/* if on-tree, remove it */
			if (mz->on_tree)
				__mem_cgroup_remove_exceeded(mz, mctz);
			/*
			 * Insert again. mz->usage_in_excess will be updated.
			 * If excess is 0, no tree ops.
			 */
			__mem_cgroup_insert_exceeded(mz, mctz, excess);
			spin_unlock_irqrestore(&mctz->lock, flags);
		}
	}
}

static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
{
	struct mem_cgroup_tree_per_node *mctz;
	struct mem_cgroup_per_node *mz;
	int nid;

	for_each_node(nid) {
		mz = mem_cgroup_nodeinfo(memcg, nid);
		mctz = soft_limit_tree_node(nid);
		if (mctz)
			mem_cgroup_remove_exceeded(mz, mctz);
	}
}

static struct mem_cgroup_per_node *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *mz;

retry:
	mz = NULL;
	if (!mctz->rb_rightmost)
		goto done;		/* Nothing to reclaim from */

	mz = rb_entry(mctz->rb_rightmost,
		      struct mem_cgroup_per_node, tree_node);
	/*
	 * Remove the node now but someone else can add it back;
	 * we will add it back at the end of reclaim to its correct
	 * position in the tree.
755 */ 756 __mem_cgroup_remove_exceeded(mz, mctz); 757 if (!soft_limit_excess(mz->memcg) || 758 !css_tryget(&mz->memcg->css)) 759 goto retry; 760done: 761 return mz; 762} 763 764static struct mem_cgroup_per_node * 765mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) 766{ 767 struct mem_cgroup_per_node *mz; 768 769 spin_lock_irq(&mctz->lock); 770 mz = __mem_cgroup_largest_soft_limit_node(mctz); 771 spin_unlock_irq(&mctz->lock); 772 return mz; 773} 774 775/** 776 * __mod_memcg_state - update cgroup memory statistics 777 * @memcg: the memory cgroup 778 * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item 779 * @val: delta to add to the counter, can be negative 780 */ 781void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) 782{ 783 long x, threshold = MEMCG_CHARGE_BATCH; 784 785 if (mem_cgroup_disabled()) 786 return; 787 788 if (memcg_stat_item_in_bytes(idx)) 789 threshold <<= PAGE_SHIFT; 790 791 x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]); 792 if (unlikely(abs(x) > threshold)) { 793 struct mem_cgroup *mi; 794 795 /* 796 * Batch local counters to keep them in sync with 797 * the hierarchical ones. 798 */ 799 __this_cpu_add(memcg->vmstats_local->stat[idx], x); 800 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) 801 atomic_long_add(x, &mi->vmstats[idx]); 802 x = 0; 803 } 804 __this_cpu_write(memcg->vmstats_percpu->stat[idx], x); 805} 806 807static struct mem_cgroup_per_node * 808parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid) 809{ 810 struct mem_cgroup *parent; 811 812 parent = parent_mem_cgroup(pn->memcg); 813 if (!parent) 814 return NULL; 815 return mem_cgroup_nodeinfo(parent, nid); 816} 817 818void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, 819 int val) 820{ 821 struct mem_cgroup_per_node *pn; 822 struct mem_cgroup *memcg; 823 long x, threshold = MEMCG_CHARGE_BATCH; 824 825 pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); 826 memcg = pn->memcg; 827 828 /* Update memcg */ 829 __mod_memcg_state(memcg, idx, val); 830 831 /* Update lruvec */ 832 __this_cpu_add(pn->lruvec_stat_local->count[idx], val); 833 834 if (vmstat_item_in_bytes(idx)) 835 threshold <<= PAGE_SHIFT; 836 837 x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]); 838 if (unlikely(abs(x) > threshold)) { 839 pg_data_t *pgdat = lruvec_pgdat(lruvec); 840 struct mem_cgroup_per_node *pi; 841 842 for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id)) 843 atomic_long_add(x, &pi->lruvec_stat[idx]); 844 x = 0; 845 } 846 __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x); 847} 848 849/** 850 * __mod_lruvec_state - update lruvec memory statistics 851 * @lruvec: the lruvec 852 * @idx: the stat item 853 * @val: delta to add to the counter, can be negative 854 * 855 * The lruvec is the intersection of the NUMA node and a cgroup. This 856 * function updates the all three counters that are affected by a 857 * change of state at this level: per-node, per-cgroup, per-lruvec. 
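 *
 * Illustrative example (a sketch of a typical call, not a new caller added
 * here): dirtying a file page is accounted with something like
 *
 *	__mod_lruvec_state(lruvec, NR_FILE_DIRTY, 1);
 *
 * which always updates the node counter and, unless the memory controller
 * is disabled (or the lruvec is the HYPERHOLD node lruvec), the memcg and
 * per-lruvec counters as well.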
858 */ 859void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, 860 int val) 861{ 862 /* Update node */ 863 __mod_node_page_state(lruvec_pgdat(lruvec), idx, val); 864 865 /* Update memcg and lruvec */ 866 if (!mem_cgroup_disabled()) { 867#ifdef CONFIG_HYPERHOLD_FILE_LRU 868 if (is_node_lruvec(lruvec)) 869 return; 870#endif 871 __mod_memcg_lruvec_state(lruvec, idx, val); 872 } 873} 874 875void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val) 876{ 877 pg_data_t *pgdat = page_pgdat(virt_to_page(p)); 878 struct mem_cgroup *memcg; 879 struct lruvec *lruvec; 880 881 rcu_read_lock(); 882 memcg = mem_cgroup_from_obj(p); 883 884 /* 885 * Untracked pages have no memcg, no lruvec. Update only the 886 * node. If we reparent the slab objects to the root memcg, 887 * when we free the slab object, we need to update the per-memcg 888 * vmstats to keep it correct for the root memcg. 889 */ 890 if (!memcg) { 891 __mod_node_page_state(pgdat, idx, val); 892 } else { 893 lruvec = mem_cgroup_lruvec(memcg, pgdat); 894 __mod_lruvec_state(lruvec, idx, val); 895 } 896 rcu_read_unlock(); 897} 898 899void mod_memcg_obj_state(void *p, int idx, int val) 900{ 901 struct mem_cgroup *memcg; 902 903 rcu_read_lock(); 904 memcg = mem_cgroup_from_obj(p); 905 if (memcg) 906 mod_memcg_state(memcg, idx, val); 907 rcu_read_unlock(); 908} 909 910/** 911 * __count_memcg_events - account VM events in a cgroup 912 * @memcg: the memory cgroup 913 * @idx: the event item 914 * @count: the number of events that occured 915 */ 916void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, 917 unsigned long count) 918{ 919 unsigned long x; 920 921 if (mem_cgroup_disabled()) 922 return; 923#ifdef CONFIG_HYPERHOLD_FILE_LRU 924 if (!memcg) 925 return; 926#endif 927 928 x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]); 929 if (unlikely(x > MEMCG_CHARGE_BATCH)) { 930 struct mem_cgroup *mi; 931 932 /* 933 * Batch local counters to keep them in sync with 934 * the hierarchical ones. 935 */ 936 __this_cpu_add(memcg->vmstats_local->events[idx], x); 937 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) 938 atomic_long_add(x, &mi->vmevents[idx]); 939 x = 0; 940 } 941 __this_cpu_write(memcg->vmstats_percpu->events[idx], x); 942} 943 944static unsigned long memcg_events(struct mem_cgroup *memcg, int event) 945{ 946 return atomic_long_read(&memcg->vmevents[event]); 947} 948 949static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event) 950{ 951 long x = 0; 952 int cpu; 953 954 for_each_possible_cpu(cpu) 955 x += per_cpu(memcg->vmstats_local->events[event], cpu); 956 return x; 957} 958 959static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, 960 struct page *page, 961 int nr_pages) 962{ 963 /* pagein of a big page is an event. 
So, ignore page size */ 964 if (nr_pages > 0) 965 __count_memcg_events(memcg, PGPGIN, 1); 966 else { 967 __count_memcg_events(memcg, PGPGOUT, 1); 968 nr_pages = -nr_pages; /* for event */ 969 } 970 971 __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages); 972} 973 974static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, 975 enum mem_cgroup_events_target target) 976{ 977 unsigned long val, next; 978 979 val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events); 980 next = __this_cpu_read(memcg->vmstats_percpu->targets[target]); 981 /* from time_after() in jiffies.h */ 982 if ((long)(next - val) < 0) { 983 switch (target) { 984 case MEM_CGROUP_TARGET_THRESH: 985 next = val + THRESHOLDS_EVENTS_TARGET; 986 break; 987 case MEM_CGROUP_TARGET_SOFTLIMIT: 988 next = val + SOFTLIMIT_EVENTS_TARGET; 989 break; 990 default: 991 break; 992 } 993 __this_cpu_write(memcg->vmstats_percpu->targets[target], next); 994 return true; 995 } 996 return false; 997} 998 999/* 1000 * Check events in order. 1001 * 1002 */ 1003static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) 1004{ 1005 /* threshold event is triggered in finer grain than soft limit */ 1006 if (unlikely(mem_cgroup_event_ratelimit(memcg, 1007 MEM_CGROUP_TARGET_THRESH))) { 1008 bool do_softlimit; 1009 1010 do_softlimit = mem_cgroup_event_ratelimit(memcg, 1011 MEM_CGROUP_TARGET_SOFTLIMIT); 1012 mem_cgroup_threshold(memcg); 1013 if (unlikely(do_softlimit)) 1014 mem_cgroup_update_tree(memcg, page); 1015 } 1016} 1017 1018struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) 1019{ 1020 /* 1021 * mm_update_next_owner() may clear mm->owner to NULL 1022 * if it races with swapoff, page migration, etc. 1023 * So this can be called with p == NULL. 1024 */ 1025 if (unlikely(!p)) 1026 return NULL; 1027 1028 return mem_cgroup_from_css(task_css(p, memory_cgrp_id)); 1029} 1030EXPORT_SYMBOL(mem_cgroup_from_task); 1031 1032/** 1033 * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg. 1034 * @mm: mm from which memcg should be extracted. It can be NULL. 1035 * 1036 * Obtain a reference on mm->memcg and returns it if successful. Otherwise 1037 * root_mem_cgroup is returned. However if mem_cgroup is disabled, NULL is 1038 * returned. 1039 */ 1040struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) 1041{ 1042 struct mem_cgroup *memcg; 1043 1044 if (mem_cgroup_disabled()) 1045 return NULL; 1046 1047 rcu_read_lock(); 1048 do { 1049 /* 1050 * Page cache insertions can happen withou an 1051 * actual mm context, e.g. during disk probing 1052 * on boot, loopback IO, acct() writes etc. 1053 */ 1054 if (unlikely(!mm)) 1055 memcg = root_mem_cgroup; 1056 else { 1057 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); 1058 if (unlikely(!memcg)) 1059 memcg = root_mem_cgroup; 1060 } 1061 } while (!css_tryget(&memcg->css)); 1062 rcu_read_unlock(); 1063 return memcg; 1064} 1065EXPORT_SYMBOL(get_mem_cgroup_from_mm); 1066 1067/** 1068 * get_mem_cgroup_from_page: Obtain a reference on given page's memcg. 1069 * @page: page from which memcg should be extracted. 1070 * 1071 * Obtain a reference on page->memcg and returns it if successful. Otherwise 1072 * root_mem_cgroup is returned. 1073 */ 1074struct mem_cgroup *get_mem_cgroup_from_page(struct page *page) 1075{ 1076 struct mem_cgroup *memcg = page->mem_cgroup; 1077 1078 if (mem_cgroup_disabled()) 1079 return NULL; 1080 1081 rcu_read_lock(); 1082 /* Page should not get uncharged and freed memcg under us. 
*/ 1083 if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css))) 1084 memcg = root_mem_cgroup; 1085 rcu_read_unlock(); 1086 return memcg; 1087} 1088EXPORT_SYMBOL(get_mem_cgroup_from_page); 1089 1090static __always_inline struct mem_cgroup *active_memcg(void) 1091{ 1092 if (in_interrupt()) 1093 return this_cpu_read(int_active_memcg); 1094 else 1095 return current->active_memcg; 1096} 1097 1098static __always_inline struct mem_cgroup *get_active_memcg(void) 1099{ 1100 struct mem_cgroup *memcg; 1101 1102 rcu_read_lock(); 1103 memcg = active_memcg(); 1104 /* remote memcg must hold a ref. */ 1105 if (memcg && WARN_ON_ONCE(!css_tryget(&memcg->css))) 1106 memcg = root_mem_cgroup; 1107 rcu_read_unlock(); 1108 1109 return memcg; 1110} 1111 1112static __always_inline bool memcg_kmem_bypass(void) 1113{ 1114 /* Allow remote memcg charging from any context. */ 1115 if (unlikely(active_memcg())) 1116 return false; 1117 1118 /* Memcg to charge can't be determined. */ 1119 if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD)) 1120 return true; 1121 1122 return false; 1123} 1124 1125/** 1126 * If active memcg is set, do not fallback to current->mm->memcg. 1127 */ 1128static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void) 1129{ 1130 if (memcg_kmem_bypass()) 1131 return NULL; 1132 1133 if (unlikely(active_memcg())) 1134 return get_active_memcg(); 1135 1136 return get_mem_cgroup_from_mm(current->mm); 1137} 1138 1139/** 1140 * mem_cgroup_iter - iterate over memory cgroup hierarchy 1141 * @root: hierarchy root 1142 * @prev: previously returned memcg, NULL on first invocation 1143 * @reclaim: cookie for shared reclaim walks, NULL for full walks 1144 * 1145 * Returns references to children of the hierarchy below @root, or 1146 * @root itself, or %NULL after a full round-trip. 1147 * 1148 * Caller must pass the return value in @prev on subsequent 1149 * invocations for reference counting, or use mem_cgroup_iter_break() 1150 * to cancel a hierarchy walk before the round-trip is complete. 1151 * 1152 * Reclaimers can specify a node in @reclaim to divide up the memcgs 1153 * in the hierarchy among all concurrent reclaimers operating on the 1154 * same node. 1155 */ 1156struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, 1157 struct mem_cgroup *prev, 1158 struct mem_cgroup_reclaim_cookie *reclaim) 1159{ 1160 struct mem_cgroup_reclaim_iter *iter; 1161 struct cgroup_subsys_state *css = NULL; 1162 struct mem_cgroup *memcg = NULL; 1163 struct mem_cgroup *pos = NULL; 1164 1165 if (mem_cgroup_disabled()) 1166 return NULL; 1167 1168 if (!root) 1169 root = root_mem_cgroup; 1170 1171 if (prev && !reclaim) 1172 pos = prev; 1173 1174 if (!root->use_hierarchy && root != root_mem_cgroup) { 1175 if (prev) 1176 goto out; 1177 return root; 1178 } 1179 1180 rcu_read_lock(); 1181 1182 if (reclaim) { 1183 struct mem_cgroup_per_node *mz; 1184 1185 mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id); 1186 iter = &mz->iter; 1187 1188 if (prev && reclaim->generation != iter->generation) 1189 goto out_unlock; 1190 1191 while (1) { 1192 pos = READ_ONCE(iter->position); 1193 if (!pos || css_tryget(&pos->css)) 1194 break; 1195 /* 1196 * css reference reached zero, so iter->position will 1197 * be cleared by ->css_released. However, we should not 1198 * rely on this happening soon, because ->css_released 1199 * is called from a work queue, and by busy-waiting we 1200 * might block it. So we clear iter->position right 1201 * away. 
			 */
			(void)cmpxchg(&iter->position, pos, NULL);
		}
	}

	if (pos)
		css = &pos->css;

	for (;;) {
		css = css_next_descendant_pre(css, &root->css);
		if (!css) {
			/*
			 * Reclaimers share the hierarchy walk, and a
			 * new one might jump in right at the end of
			 * the hierarchy - make sure they see at least
			 * one group and restart from the beginning.
			 */
			if (!prev)
				continue;
			break;
		}

		/*
		 * Verify the css and acquire a reference. The root
		 * is provided by the caller, so we know it's alive
		 * and kicking, and don't take an extra reference.
		 */
		memcg = mem_cgroup_from_css(css);

		if (css == &root->css)
			break;

		if (css_tryget(css))
			break;

		memcg = NULL;
	}

	if (reclaim) {
		/*
		 * The position could have already been updated by a competing
		 * thread, so check that the value hasn't changed since we read
		 * it to avoid reclaiming from the same cgroup twice.
		 */
		(void)cmpxchg(&iter->position, pos, memcg);

		if (pos)
			css_put(&pos->css);

		if (!memcg)
			iter->generation++;
		else if (!prev)
			reclaim->generation = iter->generation;
	}

out_unlock:
	rcu_read_unlock();
out:
	if (prev && prev != root)
		css_put(&prev->css);

	return memcg;
}

/**
 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
 * @root: hierarchy root
 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
 */
void mem_cgroup_iter_break(struct mem_cgroup *root,
			   struct mem_cgroup *prev)
{
	if (!root)
		root = root_mem_cgroup;
	if (prev && prev != root)
		css_put(&prev->css);
}

static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
					   struct mem_cgroup *dead_memcg)
{
	struct mem_cgroup_reclaim_iter *iter;
	struct mem_cgroup_per_node *mz;
	int nid;

	for_each_node(nid) {
		mz = mem_cgroup_nodeinfo(from, nid);
		iter = &mz->iter;
		cmpxchg(&iter->position, dead_memcg, NULL);
	}
}

static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
{
	struct mem_cgroup *memcg = dead_memcg;
	struct mem_cgroup *last;

	do {
		__invalidate_reclaim_iterators(memcg, dead_memcg);
		last = memcg;
	} while ((memcg = parent_mem_cgroup(memcg)));

	/*
	 * When cgroup1 non-hierarchy mode is used,
	 * parent_mem_cgroup() does not walk all the way up to the
	 * cgroup root (root_mem_cgroup). So we have to handle
	 * dead_memcg from the cgroup root separately.
	 */
	if (last != root_mem_cgroup)
		__invalidate_reclaim_iterators(root_mem_cgroup,
					       dead_memcg);
}

/**
 * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
 * @memcg: hierarchy root
 * @fn: function to call for each task
 * @arg: argument passed to @fn
 *
 * This function iterates over tasks attached to @memcg or to any of its
 * descendants and calls @fn for each task. If @fn returns a non-zero
 * value, the function breaks the iteration loop and returns the value.
 * Otherwise, it will iterate over all tasks and return 0.
 *
 * This function must not be called for the root memory cgroup.
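 *
 * Illustrative sketch (count_task() is a hypothetical callback, not part of
 * this file):
 *
 *	static int count_task(struct task_struct *task, void *arg)
 *	{
 *		(*(unsigned int *)arg)++;
 *		return 0;
 *	}
 *
 *	unsigned int nr = 0;
 *	mem_cgroup_scan_tasks(memcg, count_task, &nr);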
1327 */ 1328int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, 1329 int (*fn)(struct task_struct *, void *), void *arg) 1330{ 1331 struct mem_cgroup *iter; 1332 int ret = 0; 1333 1334 BUG_ON(memcg == root_mem_cgroup); 1335 1336 for_each_mem_cgroup_tree(iter, memcg) { 1337 struct css_task_iter it; 1338 struct task_struct *task; 1339 1340 css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it); 1341 while (!ret && (task = css_task_iter_next(&it))) 1342 ret = fn(task, arg); 1343 css_task_iter_end(&it); 1344 if (ret) { 1345 mem_cgroup_iter_break(memcg, iter); 1346 break; 1347 } 1348 } 1349 return ret; 1350} 1351 1352/** 1353 * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page 1354 * @page: the page 1355 * @pgdat: pgdat of the page 1356 * 1357 * This function relies on page->mem_cgroup being stable - see the 1358 * access rules in commit_charge(). 1359 */ 1360struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat) 1361{ 1362 struct mem_cgroup_per_node *mz; 1363 struct mem_cgroup *memcg; 1364 struct lruvec *lruvec; 1365 1366 if (mem_cgroup_disabled()) { 1367 lruvec = &pgdat->__lruvec; 1368 goto out; 1369 } 1370 1371#ifdef CONFIG_HYPERHOLD_FILE_LRU 1372 if (page_is_file_lru(page) && 1373 !is_prot_page(page)) { 1374 lruvec = node_lruvec(pgdat); 1375 goto out; 1376 } 1377#endif 1378 memcg = page->mem_cgroup; 1379 /* 1380 * Swapcache readahead pages are added to the LRU - and 1381 * possibly migrated - before they are charged. 1382 */ 1383 if (!memcg) 1384 memcg = root_mem_cgroup; 1385 1386 mz = mem_cgroup_page_nodeinfo(memcg, page); 1387 lruvec = &mz->lruvec; 1388out: 1389 /* 1390 * Since a node can be onlined after the mem_cgroup was created, 1391 * we have to be prepared to initialize lruvec->zone here; 1392 * and if offlined then reonlined, we need to reinitialize it. 1393 */ 1394 if (unlikely(lruvec->pgdat != pgdat)) 1395 lruvec->pgdat = pgdat; 1396 return lruvec; 1397} 1398 1399/** 1400 * mem_cgroup_update_lru_size - account for adding or removing an lru page 1401 * @lruvec: mem_cgroup per zone lru vector 1402 * @lru: index of lru list the page is sitting on 1403 * @zid: zone id of the accounted pages 1404 * @nr_pages: positive when adding or negative when removing 1405 * 1406 * This function must be called under lru_lock, just before a page is added 1407 * to or just after a page is removed from an lru list (that ordering being 1408 * so as to allow it to check that lru_size 0 is consistent with list_empty). 1409 */ 1410void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, 1411 int zid, int nr_pages) 1412{ 1413 struct mem_cgroup_per_node *mz; 1414 unsigned long *lru_size; 1415 long size; 1416 1417 if (mem_cgroup_disabled()) 1418 return; 1419 1420#ifdef CONFIG_HYPERHOLD_FILE_LRU 1421 if (is_node_lruvec(lruvec)) 1422 return; 1423#endif 1424 mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); 1425 lru_size = &mz->lru_zone_size[zid][lru]; 1426 1427 if (nr_pages < 0) 1428 *lru_size += nr_pages; 1429 1430 size = *lru_size; 1431 if (WARN_ONCE(size < 0, 1432 "%s(%p, %d, %d): lru_size %ld\n", 1433 __func__, lruvec, lru, nr_pages, size)) { 1434 VM_BUG_ON(1); 1435 *lru_size = 0; 1436 } 1437 1438 if (nr_pages > 0) 1439 *lru_size += nr_pages; 1440} 1441 1442/** 1443 * mem_cgroup_margin - calculate chargeable space of a memory cgroup 1444 * @memcg: the memory cgroup 1445 * 1446 * Returns the maximum amount of memory @mem can be charged with, in 1447 * pages. 
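 *
 * For example (illustrative numbers): with memory.max at 100 pages and a
 * current usage of 80 pages, the margin is 20 pages; with legacy
 * memory+swap accounting active, the result is further clamped by the
 * remaining headroom of the memsw counter.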
1448 */ 1449static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) 1450{ 1451 unsigned long margin = 0; 1452 unsigned long count; 1453 unsigned long limit; 1454 1455 count = page_counter_read(&memcg->memory); 1456 limit = READ_ONCE(memcg->memory.max); 1457 if (count < limit) 1458 margin = limit - count; 1459 1460 if (do_memsw_account()) { 1461 count = page_counter_read(&memcg->memsw); 1462 limit = READ_ONCE(memcg->memsw.max); 1463 if (count < limit) 1464 margin = min(margin, limit - count); 1465 else 1466 margin = 0; 1467 } 1468 1469 return margin; 1470} 1471 1472/* 1473 * A routine for checking "mem" is under move_account() or not. 1474 * 1475 * Checking a cgroup is mc.from or mc.to or under hierarchy of 1476 * moving cgroups. This is for waiting at high-memory pressure 1477 * caused by "move". 1478 */ 1479static bool mem_cgroup_under_move(struct mem_cgroup *memcg) 1480{ 1481 struct mem_cgroup *from; 1482 struct mem_cgroup *to; 1483 bool ret = false; 1484 /* 1485 * Unlike task_move routines, we access mc.to, mc.from not under 1486 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead. 1487 */ 1488 spin_lock(&mc.lock); 1489 from = mc.from; 1490 to = mc.to; 1491 if (!from) 1492 goto unlock; 1493 1494 ret = mem_cgroup_is_descendant(from, memcg) || 1495 mem_cgroup_is_descendant(to, memcg); 1496unlock: 1497 spin_unlock(&mc.lock); 1498 return ret; 1499} 1500 1501static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) 1502{ 1503 if (mc.moving_task && current != mc.moving_task) { 1504 if (mem_cgroup_under_move(memcg)) { 1505 DEFINE_WAIT(wait); 1506 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); 1507 /* moving charge context might have finished. */ 1508 if (mc.moving_task) 1509 schedule(); 1510 finish_wait(&mc.waitq, &wait); 1511 return true; 1512 } 1513 } 1514 return false; 1515} 1516 1517struct memory_stat { 1518 const char *name; 1519 unsigned int ratio; 1520 unsigned int idx; 1521}; 1522 1523static struct memory_stat memory_stats[] = { 1524 { "anon", PAGE_SIZE, NR_ANON_MAPPED }, 1525 { "file", PAGE_SIZE, NR_FILE_PAGES }, 1526 { "kernel_stack", 1024, NR_KERNEL_STACK_KB }, 1527 { "percpu", 1, MEMCG_PERCPU_B }, 1528 { "sock", PAGE_SIZE, MEMCG_SOCK }, 1529 { "shmem", PAGE_SIZE, NR_SHMEM }, 1530 { "file_mapped", PAGE_SIZE, NR_FILE_MAPPED }, 1531 { "file_dirty", PAGE_SIZE, NR_FILE_DIRTY }, 1532 { "file_writeback", PAGE_SIZE, NR_WRITEBACK }, 1533#ifdef CONFIG_TRANSPARENT_HUGEPAGE 1534 /* 1535 * The ratio will be initialized in memory_stats_init(). Because 1536 * on some architectures, the macro of HPAGE_PMD_SIZE is not 1537 * constant(e.g. powerpc). 1538 */ 1539 { "anon_thp", 0, NR_ANON_THPS }, 1540#endif 1541 { "inactive_anon", PAGE_SIZE, NR_INACTIVE_ANON }, 1542 { "active_anon", PAGE_SIZE, NR_ACTIVE_ANON }, 1543 { "inactive_file", PAGE_SIZE, NR_INACTIVE_FILE }, 1544 { "active_file", PAGE_SIZE, NR_ACTIVE_FILE }, 1545 { "unevictable", PAGE_SIZE, NR_UNEVICTABLE }, 1546 1547 /* 1548 * Note: The slab_reclaimable and slab_unreclaimable must be 1549 * together and slab_reclaimable must be in front. 
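	 * (The reason can be seen in memory_stat_format() below: the combined
	 * "slab" line is emitted when the loop reaches NR_SLAB_UNRECLAIMABLE_B,
	 * so keeping the pair adjacent and in this order makes "slab" follow
	 * both individual entries.)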
1550 */ 1551 { "slab_reclaimable", 1, NR_SLAB_RECLAIMABLE_B }, 1552 { "slab_unreclaimable", 1, NR_SLAB_UNRECLAIMABLE_B }, 1553 1554 /* The memory events */ 1555 { "workingset_refault_anon", 1, WORKINGSET_REFAULT_ANON }, 1556 { "workingset_refault_file", 1, WORKINGSET_REFAULT_FILE }, 1557 { "workingset_activate_anon", 1, WORKINGSET_ACTIVATE_ANON }, 1558 { "workingset_activate_file", 1, WORKINGSET_ACTIVATE_FILE }, 1559 { "workingset_restore_anon", 1, WORKINGSET_RESTORE_ANON }, 1560 { "workingset_restore_file", 1, WORKINGSET_RESTORE_FILE }, 1561 { "workingset_nodereclaim", 1, WORKINGSET_NODERECLAIM }, 1562}; 1563 1564static int __init memory_stats_init(void) 1565{ 1566 int i; 1567 1568 for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { 1569#ifdef CONFIG_TRANSPARENT_HUGEPAGE 1570 if (memory_stats[i].idx == NR_ANON_THPS) 1571 memory_stats[i].ratio = HPAGE_PMD_SIZE; 1572#endif 1573 VM_BUG_ON(!memory_stats[i].ratio); 1574 VM_BUG_ON(memory_stats[i].idx >= MEMCG_NR_STAT); 1575 } 1576 1577 return 0; 1578} 1579pure_initcall(memory_stats_init); 1580 1581static char *memory_stat_format(struct mem_cgroup *memcg) 1582{ 1583 struct seq_buf s; 1584 int i; 1585 1586 seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE); 1587 if (!s.buffer) 1588 return NULL; 1589 1590 /* 1591 * Provide statistics on the state of the memory subsystem as 1592 * well as cumulative event counters that show past behavior. 1593 * 1594 * This list is ordered following a combination of these gradients: 1595 * 1) generic big picture -> specifics and details 1596 * 2) reflecting userspace activity -> reflecting kernel heuristics 1597 * 1598 * Current memory state: 1599 */ 1600 1601 for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { 1602 u64 size; 1603 1604 size = memcg_page_state(memcg, memory_stats[i].idx); 1605 size *= memory_stats[i].ratio; 1606 seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size); 1607 1608 if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) { 1609 size = memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) + 1610 memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B); 1611 seq_buf_printf(&s, "slab %llu\n", size); 1612 } 1613 } 1614 1615 /* Accumulated memory events */ 1616 1617 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGFAULT), 1618 memcg_events(memcg, PGFAULT)); 1619 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT), 1620 memcg_events(memcg, PGMAJFAULT)); 1621 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGREFILL), 1622 memcg_events(memcg, PGREFILL)); 1623 seq_buf_printf(&s, "pgscan %lu\n", 1624 memcg_events(memcg, PGSCAN_KSWAPD) + 1625 memcg_events(memcg, PGSCAN_DIRECT)); 1626 seq_buf_printf(&s, "pgsteal %lu\n", 1627 memcg_events(memcg, PGSTEAL_KSWAPD) + 1628 memcg_events(memcg, PGSTEAL_DIRECT)); 1629 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGACTIVATE), 1630 memcg_events(memcg, PGACTIVATE)); 1631 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGDEACTIVATE), 1632 memcg_events(memcg, PGDEACTIVATE)); 1633 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREE), 1634 memcg_events(memcg, PGLAZYFREE)); 1635 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREED), 1636 memcg_events(memcg, PGLAZYFREED)); 1637 1638#ifdef CONFIG_TRANSPARENT_HUGEPAGE 1639 seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_FAULT_ALLOC), 1640 memcg_events(memcg, THP_FAULT_ALLOC)); 1641 seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_COLLAPSE_ALLOC), 1642 memcg_events(memcg, THP_COLLAPSE_ALLOC)); 1643#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 1644 1645 /* The above should easily fit into one page */ 1646 
WARN_ON_ONCE(seq_buf_has_overflowed(&s)); 1647 1648 return s.buffer; 1649} 1650 1651#define K(x) ((x) << (PAGE_SHIFT-10)) 1652/** 1653 * mem_cgroup_print_oom_context: Print OOM information relevant to 1654 * memory controller. 1655 * @memcg: The memory cgroup that went over limit 1656 * @p: Task that is going to be killed 1657 * 1658 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is 1659 * enabled 1660 */ 1661void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p) 1662{ 1663 rcu_read_lock(); 1664 1665 if (memcg) { 1666 pr_cont(",oom_memcg="); 1667 pr_cont_cgroup_path(memcg->css.cgroup); 1668 } else 1669 pr_cont(",global_oom"); 1670 if (p) { 1671 pr_cont(",task_memcg="); 1672 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); 1673 } 1674 rcu_read_unlock(); 1675} 1676 1677/** 1678 * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to 1679 * memory controller. 1680 * @memcg: The memory cgroup that went over limit 1681 */ 1682void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) 1683{ 1684 char *buf; 1685 1686 pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n", 1687 K((u64)page_counter_read(&memcg->memory)), 1688 K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt); 1689 if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 1690 pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n", 1691 K((u64)page_counter_read(&memcg->swap)), 1692 K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt); 1693 else { 1694 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n", 1695 K((u64)page_counter_read(&memcg->memsw)), 1696 K((u64)memcg->memsw.max), memcg->memsw.failcnt); 1697 pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n", 1698 K((u64)page_counter_read(&memcg->kmem)), 1699 K((u64)memcg->kmem.max), memcg->kmem.failcnt); 1700 } 1701 1702 pr_info("Memory cgroup stats for "); 1703 pr_cont_cgroup_path(memcg->css.cgroup); 1704 pr_cont(":"); 1705 buf = memory_stat_format(memcg); 1706 if (!buf) 1707 return; 1708 pr_info("%s", buf); 1709 kfree(buf); 1710} 1711 1712/* 1713 * Return the memory (and swap, if configured) limit for a memcg. 1714 */ 1715unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) 1716{ 1717 unsigned long max = READ_ONCE(memcg->memory.max); 1718 1719 if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) { 1720 if (mem_cgroup_swappiness(memcg)) 1721 max += min(READ_ONCE(memcg->swap.max), 1722 (unsigned long)total_swap_pages); 1723 } else { /* v1 */ 1724 if (mem_cgroup_swappiness(memcg)) { 1725 /* Calculate swap excess capacity from memsw limit */ 1726 unsigned long swap = READ_ONCE(memcg->memsw.max) - max; 1727 1728 max += min(swap, (unsigned long)total_swap_pages); 1729 } 1730 } 1731 return max; 1732} 1733 1734unsigned long mem_cgroup_size(struct mem_cgroup *memcg) 1735{ 1736 return page_counter_read(&memcg->memory); 1737} 1738 1739static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, 1740 int order) 1741{ 1742 struct oom_control oc = { 1743 .zonelist = NULL, 1744 .nodemask = NULL, 1745 .memcg = memcg, 1746 .gfp_mask = gfp_mask, 1747 .order = order, 1748 }; 1749 bool ret = true; 1750 1751 if (mutex_lock_killable(&oom_lock)) 1752 return true; 1753 1754 if (mem_cgroup_margin(memcg) >= (1 << order)) 1755 goto unlock; 1756 1757 /* 1758 * A few threads which were not waiting at mutex_lock_killable() can 1759 * fail to bail out. Therefore, check again after holding oom_lock. 
	 */
	ret = task_is_dying() || out_of_memory(&oc);

unlock:
	mutex_unlock(&oom_lock);
	return ret;
}

static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
				   pg_data_t *pgdat,
				   gfp_t gfp_mask,
				   unsigned long *total_scanned)
{
	struct mem_cgroup *victim = NULL;
	int total = 0;
	int loop = 0;
	unsigned long excess;
	unsigned long nr_scanned;
	struct mem_cgroup_reclaim_cookie reclaim = {
		.pgdat = pgdat,
	};

	excess = soft_limit_excess(root_memcg);

	while (1) {
		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
		if (!victim) {
			loop++;
			if (loop >= 2) {
				/*
				 * If we have not been able to reclaim
				 * anything, it might be because there are
				 * no reclaimable pages under this hierarchy.
				 */
				if (!total)
					break;
				/*
				 * We want to do more targeted reclaim.
				 * excess >> 2 is not too aggressive, so we
				 * don't reclaim too much, nor so small that
				 * we keep coming back to reclaim from this
				 * cgroup.
				 */
				if (total >= (excess >> 2) ||
					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
					break;
			}
			continue;
		}
		total += mem_cgroup_shrink_node(victim, gfp_mask, false,
					pgdat, &nr_scanned);
		*total_scanned += nr_scanned;
		if (!soft_limit_excess(root_memcg))
			break;
	}
	mem_cgroup_iter_break(root_memcg, victim);
	return total;
}

#ifdef CONFIG_LOCKDEP
static struct lockdep_map memcg_oom_lock_dep_map = {
	.name = "memcg_oom_lock",
};
#endif

static DEFINE_SPINLOCK(memcg_oom_lock);

/*
 * Check whether the OOM killer is already running under our hierarchy.
 * If someone else is running it, return false.
 */
static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter, *failed = NULL;

	spin_lock(&memcg_oom_lock);

	for_each_mem_cgroup_tree(iter, memcg) {
		if (iter->oom_lock) {
			/*
			 * This subtree of our hierarchy is already locked,
			 * so we cannot take the lock.
			 */
			failed = iter;
			mem_cgroup_iter_break(memcg, iter);
			break;
		} else
			iter->oom_lock = true;
	}

	if (failed) {
		/*
		 * OK, we failed to lock the whole subtree, so we have
		 * to clean up what we already set up, up to the failing
		 * subtree.
		 */
		for_each_mem_cgroup_tree(iter, memcg) {
			if (iter == failed) {
				mem_cgroup_iter_break(memcg, iter);
				break;
			}
			iter->oom_lock = false;
		}
	} else
		mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);

	spin_unlock(&memcg_oom_lock);

	return !failed;
}

static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	spin_lock(&memcg_oom_lock);
	mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
	for_each_mem_cgroup_tree(iter, memcg)
		iter->oom_lock = false;
	spin_unlock(&memcg_oom_lock);
}

static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	spin_lock(&memcg_oom_lock);
	for_each_mem_cgroup_tree(iter, memcg)
		iter->under_oom++;
	spin_unlock(&memcg_oom_lock);
}

static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	/*
	 * Be careful about under_oom underflows because a child memcg
	 * could have been added after mem_cgroup_mark_under_oom.
1897 */ 1898 spin_lock(&memcg_oom_lock); 1899 for_each_mem_cgroup_tree(iter, memcg) 1900 if (iter->under_oom > 0) 1901 iter->under_oom--; 1902 spin_unlock(&memcg_oom_lock); 1903} 1904 1905static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1906 1907struct oom_wait_info { 1908 struct mem_cgroup *memcg; 1909 wait_queue_entry_t wait; 1910}; 1911 1912static int memcg_oom_wake_function(wait_queue_entry_t *wait, 1913 unsigned mode, int sync, void *arg) 1914{ 1915 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; 1916 struct mem_cgroup *oom_wait_memcg; 1917 struct oom_wait_info *oom_wait_info; 1918 1919 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 1920 oom_wait_memcg = oom_wait_info->memcg; 1921 1922 if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) && 1923 !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg)) 1924 return 0; 1925 return autoremove_wake_function(wait, mode, sync, arg); 1926} 1927 1928static void memcg_oom_recover(struct mem_cgroup *memcg) 1929{ 1930 /* 1931 * For the following lockless ->under_oom test, the only required 1932 * guarantee is that it must see the state asserted by an OOM when 1933 * this function is called as a result of userland actions 1934 * triggered by the notification of the OOM. This is trivially 1935 * achieved by invoking mem_cgroup_mark_under_oom() before 1936 * triggering notification. 1937 */ 1938 if (memcg && memcg->under_oom) 1939 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); 1940} 1941 1942enum oom_status { 1943 OOM_SUCCESS, 1944 OOM_FAILED, 1945 OOM_ASYNC, 1946 OOM_SKIPPED 1947}; 1948 1949static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) 1950{ 1951 enum oom_status ret; 1952 bool locked; 1953 1954 if (order > PAGE_ALLOC_COSTLY_ORDER) 1955 return OOM_SKIPPED; 1956 1957 memcg_memory_event(memcg, MEMCG_OOM); 1958 1959 /* 1960 * We are in the middle of the charge context here, so we 1961 * don't want to block when potentially sitting on a callstack 1962 * that holds all kinds of filesystem and mm locks. 1963 * 1964 * cgroup1 allows disabling the OOM killer and waiting for outside 1965 * handling until the charge can succeed; remember the context and put 1966 * the task to sleep at the end of the page fault when all locks are 1967 * released. 1968 * 1969 * On the other hand, in-kernel OOM killer allows for an async victim 1970 * memory reclaim (oom_reaper) and that means that we are not solely 1971 * relying on the oom victim to make a forward progress and we can 1972 * invoke the oom killer here. 1973 * 1974 * Please note that mem_cgroup_out_of_memory might fail to find a 1975 * victim and then we have to bail out from the charge path. 
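	 *
	 * To summarize this function's outcomes: costly orders are
	 * OOM_SKIPPED up front; with oom_kill_disable set, a charge from a
	 * user fault gets OOM_ASYNC (completed later by
	 * mem_cgroup_oom_synchronize()) and anything else gets OOM_SKIPPED;
	 * otherwise the kernel OOM killer is invoked here and the result is
	 * OOM_SUCCESS or OOM_FAILED.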
1976 */ 1977 if (memcg->oom_kill_disable) { 1978 if (!current->in_user_fault) 1979 return OOM_SKIPPED; 1980 css_get(&memcg->css); 1981 current->memcg_in_oom = memcg; 1982 current->memcg_oom_gfp_mask = mask; 1983 current->memcg_oom_order = order; 1984 1985 return OOM_ASYNC; 1986 } 1987 1988 mem_cgroup_mark_under_oom(memcg); 1989 1990 locked = mem_cgroup_oom_trylock(memcg); 1991 1992 if (locked) 1993 mem_cgroup_oom_notify(memcg); 1994 1995 mem_cgroup_unmark_under_oom(memcg); 1996 if (mem_cgroup_out_of_memory(memcg, mask, order)) 1997 ret = OOM_SUCCESS; 1998 else 1999 ret = OOM_FAILED; 2000 2001 if (locked) 2002 mem_cgroup_oom_unlock(memcg); 2003 2004 return ret; 2005} 2006 2007/** 2008 * mem_cgroup_oom_synchronize - complete memcg OOM handling 2009 * @handle: actually kill/wait or just clean up the OOM state 2010 * 2011 * This has to be called at the end of a page fault if the memcg OOM 2012 * handler was enabled. 2013 * 2014 * Memcg supports userspace OOM handling where failed allocations must 2015 * sleep on a waitqueue until the userspace task resolves the 2016 * situation. Sleeping directly in the charge context with all kinds 2017 * of locks held is not a good idea, instead we remember an OOM state 2018 * in the task and mem_cgroup_oom_synchronize() has to be called at 2019 * the end of the page fault to complete the OOM handling. 2020 * 2021 * Returns %true if an ongoing memcg OOM situation was detected and 2022 * completed, %false otherwise. 2023 */ 2024bool mem_cgroup_oom_synchronize(bool handle) 2025{ 2026 struct mem_cgroup *memcg = current->memcg_in_oom; 2027 struct oom_wait_info owait; 2028 bool locked; 2029 2030 /* OOM is global, do not handle */ 2031 if (!memcg) 2032 return false; 2033 2034 if (!handle) 2035 goto cleanup; 2036 2037 owait.memcg = memcg; 2038 owait.wait.flags = 0; 2039 owait.wait.func = memcg_oom_wake_function; 2040 owait.wait.private = current; 2041 INIT_LIST_HEAD(&owait.wait.entry); 2042 2043 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); 2044 mem_cgroup_mark_under_oom(memcg); 2045 2046 locked = mem_cgroup_oom_trylock(memcg); 2047 2048 if (locked) 2049 mem_cgroup_oom_notify(memcg); 2050 2051 if (locked && !memcg->oom_kill_disable) { 2052 mem_cgroup_unmark_under_oom(memcg); 2053 finish_wait(&memcg_oom_waitq, &owait.wait); 2054 mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask, 2055 current->memcg_oom_order); 2056 } else { 2057 schedule(); 2058 mem_cgroup_unmark_under_oom(memcg); 2059 finish_wait(&memcg_oom_waitq, &owait.wait); 2060 } 2061 2062 if (locked) { 2063 mem_cgroup_oom_unlock(memcg); 2064 /* 2065 * There is no guarantee that an OOM-lock contender 2066 * sees the wakeups triggered by the OOM kill 2067 * uncharges. Wake any sleepers explicitely. 2068 */ 2069 memcg_oom_recover(memcg); 2070 } 2071cleanup: 2072 current->memcg_in_oom = NULL; 2073 css_put(&memcg->css); 2074 return true; 2075} 2076 2077/** 2078 * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM 2079 * @victim: task to be killed by the OOM killer 2080 * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM 2081 * 2082 * Returns a pointer to a memory cgroup, which has to be cleaned up 2083 * by killing all belonging OOM-killable tasks. 2084 * 2085 * Caller has to call mem_cgroup_put() on the returned non-NULL memcg. 
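 *
 * A minimal usage sketch (hypothetical caller, error handling elided):
 *
 *	struct mem_cgroup *group;
 *
 *	group = mem_cgroup_get_oom_group(victim, oom_domain);
 *	if (group) {
 *		mem_cgroup_print_oom_group(group);
 *		... kill the OOM-killable tasks in the group ...
 *		mem_cgroup_put(group);
 *	}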
2086 */ 2087struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, 2088 struct mem_cgroup *oom_domain) 2089{ 2090 struct mem_cgroup *oom_group = NULL; 2091 struct mem_cgroup *memcg; 2092 2093 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) 2094 return NULL; 2095 2096 if (!oom_domain) 2097 oom_domain = root_mem_cgroup; 2098 2099 rcu_read_lock(); 2100 2101 memcg = mem_cgroup_from_task(victim); 2102 if (memcg == root_mem_cgroup) 2103 goto out; 2104 2105 /* 2106 * If the victim task has been asynchronously moved to a different 2107 * memory cgroup, we might end up killing tasks outside oom_domain. 2108 * In this case it's better to ignore memory.group.oom. 2109 */ 2110 if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain))) 2111 goto out; 2112 2113 /* 2114 * Traverse the memory cgroup hierarchy from the victim task's 2115 * cgroup up to the OOMing cgroup (or root) to find the 2116 * highest-level memory cgroup with oom.group set. 2117 */ 2118 for (; memcg; memcg = parent_mem_cgroup(memcg)) { 2119 if (memcg->oom_group) 2120 oom_group = memcg; 2121 2122 if (memcg == oom_domain) 2123 break; 2124 } 2125 2126 if (oom_group) 2127 css_get(&oom_group->css); 2128out: 2129 rcu_read_unlock(); 2130 2131 return oom_group; 2132} 2133 2134void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) 2135{ 2136 pr_info("Tasks in "); 2137 pr_cont_cgroup_path(memcg->css.cgroup); 2138 pr_cont(" are going to be killed due to memory.oom.group set\n"); 2139} 2140 2141/** 2142 * lock_page_memcg - lock a page->mem_cgroup binding 2143 * @page: the page 2144 * 2145 * This function protects unlocked LRU pages from being moved to 2146 * another cgroup. 2147 * 2148 * It ensures lifetime of the returned memcg. Caller is responsible 2149 * for the lifetime of the page; __unlock_page_memcg() is available 2150 * when @page might get freed inside the locked section. 2151 */ 2152struct mem_cgroup *lock_page_memcg(struct page *page) 2153{ 2154 struct page *head = compound_head(page); /* rmap on tail pages */ 2155 struct mem_cgroup *memcg; 2156 unsigned long flags; 2157 2158 /* 2159 * The RCU lock is held throughout the transaction. The fast 2160 * path can get away without acquiring the memcg->move_lock 2161 * because page moving starts with an RCU grace period. 2162 * 2163 * The RCU lock also protects the memcg from being freed when 2164 * the page state that is going to change is the only thing 2165 * preventing the page itself from being freed. E.g. writeback 2166 * doesn't hold a page reference and relies on PG_writeback to 2167 * keep off truncation, migration and so forth. 2168 */ 2169 rcu_read_lock(); 2170 2171 if (mem_cgroup_disabled()) 2172 return NULL; 2173again: 2174 memcg = head->mem_cgroup; 2175 if (unlikely(!memcg)) 2176 return NULL; 2177 2178 if (atomic_read(&memcg->moving_account) <= 0) 2179 return memcg; 2180 2181 spin_lock_irqsave(&memcg->move_lock, flags); 2182 if (memcg != head->mem_cgroup) { 2183 spin_unlock_irqrestore(&memcg->move_lock, flags); 2184 goto again; 2185 } 2186 2187 /* 2188 * When charge migration first begins, we can have locked and 2189 * unlocked page stat updates happening concurrently. Track 2190 * the task who has the lock for unlock_page_memcg(). 2191 */ 2192 memcg->move_lock_task = current; 2193 memcg->move_lock_flags = flags; 2194 2195 return memcg; 2196} 2197EXPORT_SYMBOL(lock_page_memcg); 2198 2199/** 2200 * __unlock_page_memcg - unlock and unpin a memcg 2201 * @memcg: the memcg 2202 * 2203 * Unlock and unpin a memcg returned by lock_page_memcg(). 
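 *
 * A typical pairing (sketch) when @page may be freed inside the
 * critical section:
 *
 *	memcg = lock_page_memcg(page);
 *	... update the page state counters ...
 *	__unlock_page_memcg(memcg);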
2204 */ 2205void __unlock_page_memcg(struct mem_cgroup *memcg) 2206{ 2207 if (memcg && memcg->move_lock_task == current) { 2208 unsigned long flags = memcg->move_lock_flags; 2209 2210 memcg->move_lock_task = NULL; 2211 memcg->move_lock_flags = 0; 2212 2213 spin_unlock_irqrestore(&memcg->move_lock, flags); 2214 } 2215 2216 rcu_read_unlock(); 2217} 2218 2219/** 2220 * unlock_page_memcg - unlock a page->mem_cgroup binding 2221 * @page: the page 2222 */ 2223void unlock_page_memcg(struct page *page) 2224{ 2225 struct page *head = compound_head(page); 2226 2227 __unlock_page_memcg(head->mem_cgroup); 2228} 2229EXPORT_SYMBOL(unlock_page_memcg); 2230 2231struct memcg_stock_pcp { 2232 struct mem_cgroup *cached; /* this never be root cgroup */ 2233 unsigned int nr_pages; 2234 2235#ifdef CONFIG_MEMCG_KMEM 2236 struct obj_cgroup *cached_objcg; 2237 unsigned int nr_bytes; 2238#endif 2239 2240 struct work_struct work; 2241 unsigned long flags; 2242#define FLUSHING_CACHED_CHARGE 0 2243}; 2244static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 2245static DEFINE_MUTEX(percpu_charge_mutex); 2246 2247#ifdef CONFIG_MEMCG_KMEM 2248static void drain_obj_stock(struct memcg_stock_pcp *stock); 2249static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, 2250 struct mem_cgroup *root_memcg); 2251 2252#else 2253static inline void drain_obj_stock(struct memcg_stock_pcp *stock) 2254{ 2255} 2256static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, 2257 struct mem_cgroup *root_memcg) 2258{ 2259 return false; 2260} 2261#endif 2262 2263/** 2264 * consume_stock: Try to consume stocked charge on this cpu. 2265 * @memcg: memcg to consume from. 2266 * @nr_pages: how many pages to charge. 2267 * 2268 * The charges will only happen if @memcg matches the current cpu's memcg 2269 * stock, and at least @nr_pages are available in that stock. Failure to 2270 * service an allocation will refill the stock. 2271 * 2272 * returns true if successful, false otherwise. 2273 */ 2274static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2275{ 2276 struct memcg_stock_pcp *stock; 2277 unsigned long flags; 2278 bool ret = false; 2279 2280 if (nr_pages > MEMCG_CHARGE_BATCH) 2281 return ret; 2282 2283 local_irq_save(flags); 2284 2285 stock = this_cpu_ptr(&memcg_stock); 2286 if (memcg == stock->cached && stock->nr_pages >= nr_pages) { 2287 stock->nr_pages -= nr_pages; 2288 ret = true; 2289 } 2290 2291 local_irq_restore(flags); 2292 2293 return ret; 2294} 2295 2296/* 2297 * Returns stocks cached in percpu and reset cached information. 2298 */ 2299static void drain_stock(struct memcg_stock_pcp *stock) 2300{ 2301 struct mem_cgroup *old = stock->cached; 2302 2303 if (!old) 2304 return; 2305 2306 if (stock->nr_pages) { 2307 page_counter_uncharge(&old->memory, stock->nr_pages); 2308 if (do_memsw_account()) 2309 page_counter_uncharge(&old->memsw, stock->nr_pages); 2310 stock->nr_pages = 0; 2311 } 2312 2313 css_put(&old->css); 2314 stock->cached = NULL; 2315} 2316 2317static void drain_local_stock(struct work_struct *dummy) 2318{ 2319 struct memcg_stock_pcp *stock; 2320 unsigned long flags; 2321 2322 /* 2323 * The only protection from memory hotplug vs. 
drain_stock races is 2324 * that we always operate on local CPU stock here with IRQ disabled 2325 */ 2326 local_irq_save(flags); 2327 2328 stock = this_cpu_ptr(&memcg_stock); 2329 drain_obj_stock(stock); 2330 drain_stock(stock); 2331 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); 2332 2333 local_irq_restore(flags); 2334} 2335 2336/* 2337 * Cache charges(val) to local per_cpu area. 2338 * This will be consumed by consume_stock() function, later. 2339 */ 2340static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2341{ 2342 struct memcg_stock_pcp *stock; 2343 unsigned long flags; 2344 2345 local_irq_save(flags); 2346 2347 stock = this_cpu_ptr(&memcg_stock); 2348 if (stock->cached != memcg) { /* reset if necessary */ 2349 drain_stock(stock); 2350 css_get(&memcg->css); 2351 stock->cached = memcg; 2352 } 2353 stock->nr_pages += nr_pages; 2354 2355 if (stock->nr_pages > MEMCG_CHARGE_BATCH) 2356 drain_stock(stock); 2357 2358 local_irq_restore(flags); 2359} 2360 2361/* 2362 * Drains all per-CPU charge caches for given root_memcg resp. subtree 2363 * of the hierarchy under it. 2364 */ 2365static void drain_all_stock(struct mem_cgroup *root_memcg) 2366{ 2367 int cpu, curcpu; 2368 2369 /* If someone's already draining, avoid adding running more workers. */ 2370 if (!mutex_trylock(&percpu_charge_mutex)) 2371 return; 2372 /* 2373 * Notify other cpus that system-wide "drain" is running 2374 * We do not care about races with the cpu hotplug because cpu down 2375 * as well as workers from this path always operate on the local 2376 * per-cpu data. CPU up doesn't touch memcg_stock at all. 2377 */ 2378 curcpu = get_cpu(); 2379 for_each_online_cpu(cpu) { 2380 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2381 struct mem_cgroup *memcg; 2382 bool flush = false; 2383 2384 rcu_read_lock(); 2385 memcg = stock->cached; 2386 if (memcg && stock->nr_pages && 2387 mem_cgroup_is_descendant(memcg, root_memcg)) 2388 flush = true; 2389 if (obj_stock_flush_required(stock, root_memcg)) 2390 flush = true; 2391 rcu_read_unlock(); 2392 2393 if (flush && 2394 !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { 2395 if (cpu == curcpu) 2396 drain_local_stock(&stock->work); 2397 else 2398 schedule_work_on(cpu, &stock->work); 2399 } 2400 } 2401 put_cpu(); 2402 mutex_unlock(&percpu_charge_mutex); 2403} 2404 2405static int memcg_hotplug_cpu_dead(unsigned int cpu) 2406{ 2407 struct memcg_stock_pcp *stock; 2408 struct mem_cgroup *memcg, *mi; 2409 2410 stock = &per_cpu(memcg_stock, cpu); 2411 drain_stock(stock); 2412 2413 for_each_mem_cgroup(memcg) { 2414 int i; 2415 2416 for (i = 0; i < MEMCG_NR_STAT; i++) { 2417 int nid; 2418 long x; 2419 2420 x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0); 2421 if (x) 2422 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) 2423 atomic_long_add(x, &memcg->vmstats[i]); 2424 2425 if (i >= NR_VM_NODE_STAT_ITEMS) 2426 continue; 2427 2428 for_each_node(nid) { 2429 struct mem_cgroup_per_node *pn; 2430 2431 pn = mem_cgroup_nodeinfo(memcg, nid); 2432 x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0); 2433 if (x) 2434 do { 2435 atomic_long_add(x, &pn->lruvec_stat[i]); 2436 } while ((pn = parent_nodeinfo(pn, nid))); 2437 } 2438 } 2439 2440 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) { 2441 long x; 2442 2443 x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0); 2444 if (x) 2445 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) 2446 atomic_long_add(x, &memcg->vmevents[i]); 2447 } 2448 } 2449 2450 return 0; 2451} 2452 2453static unsigned long reclaim_high(struct 
mem_cgroup *memcg, 2454 unsigned int nr_pages, 2455 gfp_t gfp_mask) 2456{ 2457 unsigned long nr_reclaimed = 0; 2458 2459 do { 2460 unsigned long pflags; 2461 2462 if (page_counter_read(&memcg->memory) <= 2463 READ_ONCE(memcg->memory.high)) 2464 continue; 2465 2466 memcg_memory_event(memcg, MEMCG_HIGH); 2467 2468 psi_memstall_enter(&pflags); 2469 nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages, 2470 gfp_mask, true); 2471 psi_memstall_leave(&pflags); 2472 } while ((memcg = parent_mem_cgroup(memcg)) && 2473 !mem_cgroup_is_root(memcg)); 2474 2475 return nr_reclaimed; 2476} 2477 2478static void high_work_func(struct work_struct *work) 2479{ 2480 struct mem_cgroup *memcg; 2481 2482 memcg = container_of(work, struct mem_cgroup, high_work); 2483 reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL); 2484} 2485 2486/* 2487 * Clamp the maximum sleep time per allocation batch to 2 seconds. This is 2488 * enough to cause a significant slowdown in most cases, while still 2489 * allowing diagnostics and tracing to proceed without becoming stuck. 2490 */ 2491#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ) 2492 2493/* 2494 * When calculating the delay, we use these on either side of the exponentiation to 2495 * maintain precision and scale to a reasonable number of jiffies (see the table 2496 * below). 2497 * 2498 * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the 2499 * overage ratio to a delay. 2500 * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the 2501 * proposed penalty in order to reduce it to a reasonable number of jiffies, and 2502 * to produce a reasonable delay curve. 2503 * 2504 * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a 2505 * reasonable delay curve compared to precision-adjusted overage, not 2506 * penalising heavily at first, but still making sure that growth beyond the 2507 * limit penalises misbehaving cgroups by slowing them down exponentially.
For 2508 * example, with a high of 100 megabytes: 2509 * 2510 * +-------+------------------------+ 2511 * | usage | time to allocate in ms | 2512 * +-------+------------------------+ 2513 * | 100M | 0 | 2514 * | 101M | 6 | 2515 * | 102M | 25 | 2516 * | 103M | 57 | 2517 * | 104M | 102 | 2518 * | 105M | 159 | 2519 * | 106M | 230 | 2520 * | 107M | 313 | 2521 * | 108M | 409 | 2522 * | 109M | 518 | 2523 * | 110M | 639 | 2524 * | 111M | 774 | 2525 * | 112M | 921 | 2526 * | 113M | 1081 | 2527 * | 114M | 1254 | 2528 * | 115M | 1439 | 2529 * | 116M | 1638 | 2530 * | 117M | 1849 | 2531 * | 118M | 2000 | 2532 * | 119M | 2000 | 2533 * | 120M | 2000 | 2534 * +-------+------------------------+ 2535 */ 2536 #define MEMCG_DELAY_PRECISION_SHIFT 20 2537 #define MEMCG_DELAY_SCALING_SHIFT 14 2538 2539static u64 calculate_overage(unsigned long usage, unsigned long high) 2540{ 2541 u64 overage; 2542 2543 if (usage <= high) 2544 return 0; 2545 2546 /* 2547 * Prevent division by 0 in overage calculation by acting as if 2548 * it was a threshold of 1 page 2549 */ 2550 high = max(high, 1UL); 2551 2552 overage = usage - high; 2553 overage <<= MEMCG_DELAY_PRECISION_SHIFT; 2554 return div64_u64(overage, high); 2555} 2556 2557static u64 mem_find_max_overage(struct mem_cgroup *memcg) 2558{ 2559 u64 overage, max_overage = 0; 2560 2561 do { 2562 overage = calculate_overage(page_counter_read(&memcg->memory), 2563 READ_ONCE(memcg->memory.high)); 2564 max_overage = max(overage, max_overage); 2565 } while ((memcg = parent_mem_cgroup(memcg)) && 2566 !mem_cgroup_is_root(memcg)); 2567 2568 return max_overage; 2569} 2570 2571static u64 swap_find_max_overage(struct mem_cgroup *memcg) 2572{ 2573 u64 overage, max_overage = 0; 2574 2575 do { 2576 overage = calculate_overage(page_counter_read(&memcg->swap), 2577 READ_ONCE(memcg->swap.high)); 2578 if (overage) 2579 memcg_memory_event(memcg, MEMCG_SWAP_HIGH); 2580 max_overage = max(overage, max_overage); 2581 } while ((memcg = parent_mem_cgroup(memcg)) && 2582 !mem_cgroup_is_root(memcg)); 2583 2584 return max_overage; 2585} 2586 2587/* 2588 * Get the number of jiffies that we should penalise a mischievous cgroup which 2589 * is exceeding its memory.high by checking both it and its ancestors. 2590 */ 2591static unsigned long calculate_high_delay(struct mem_cgroup *memcg, 2592 unsigned int nr_pages, 2593 u64 max_overage) 2594{ 2595 unsigned long penalty_jiffies; 2596 2597 if (!max_overage) 2598 return 0; 2599 2600 /* 2601 * We use overage compared to memory.high to calculate the number of 2602 * jiffies to sleep (penalty_jiffies). Ideally this value should be 2603 * fairly lenient on small overages, and increasingly harsh when the 2604 * memcg in question makes it clear that it has no intention of stopping 2605 * its crazy behaviour, so we exponentially increase the delay based on 2606 * overage amount. 2607 */ 2608 penalty_jiffies = max_overage * max_overage * HZ; 2609 penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT; 2610 penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT; 2611 2612 /* 2613 * Factor in the task's own contribution to the overage, such that four 2614 * N-sized allocations are throttled approximately the same as one 2615 * 4N-sized allocation. 2616 * 2617 * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or 2618 * larger the current charge patch is than that. 
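 *
 * Rough worked example (assuming HZ == 1000, numbers rounded): at 110M
 * of usage against a 100M high, calculate_overage() returns roughly
 * (10M << 20) / 100M ~= 104857, so the base penalty comes to
 * 104857 * 104857 * HZ >> (20 + 14) ~= 639 jiffies -- the 110M row in
 * the table further up. A charge of half a batch then sleeps for about
 * half of that.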
2619 */ 2620 return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH; 2621} 2622 2623/* 2624 * Scheduled by try_charge() to be executed from the userland return path 2625 * and reclaims memory over the high limit. 2626 */ 2627void mem_cgroup_handle_over_high(void) 2628{ 2629 unsigned long penalty_jiffies; 2630 unsigned long pflags; 2631 unsigned long nr_reclaimed; 2632 unsigned int nr_pages = current->memcg_nr_pages_over_high; 2633 int nr_retries = MAX_RECLAIM_RETRIES; 2634 struct mem_cgroup *memcg; 2635 bool in_retry = false; 2636 2637 if (likely(!nr_pages)) 2638 return; 2639 2640 memcg = get_mem_cgroup_from_mm(current->mm); 2641 current->memcg_nr_pages_over_high = 0; 2642 2643retry_reclaim: 2644 /* 2645 * The allocating task should reclaim at least the batch size, but for 2646 * subsequent retries we only want to do what's necessary to prevent oom 2647 * or breaching resource isolation. 2648 * 2649 * This is distinct from memory.max or page allocator behaviour because 2650 * memory.high is currently batched, whereas memory.max and the page 2651 * allocator run every time an allocation is made. 2652 */ 2653 nr_reclaimed = reclaim_high(memcg, 2654 in_retry ? SWAP_CLUSTER_MAX : nr_pages, 2655 GFP_KERNEL); 2656 2657 /* 2658 * memory.high is breached and reclaim is unable to keep up. Throttle 2659 * allocators proactively to slow down excessive growth. 2660 */ 2661 penalty_jiffies = calculate_high_delay(memcg, nr_pages, 2662 mem_find_max_overage(memcg)); 2663 2664 penalty_jiffies += calculate_high_delay(memcg, nr_pages, 2665 swap_find_max_overage(memcg)); 2666 2667 /* 2668 * Clamp the max delay per usermode return so as to still keep the 2669 * application moving forwards and also permit diagnostics, albeit 2670 * extremely slowly. 2671 */ 2672 penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES); 2673 2674 /* 2675 * Don't sleep if the amount of jiffies this memcg owes us is so low 2676 * that it's not even worth doing, in an attempt to be nice to those who 2677 * go only a small amount over their memory.high value and maybe haven't 2678 * been aggressively reclaimed enough yet. 2679 */ 2680 if (penalty_jiffies <= HZ / 100) 2681 goto out; 2682 2683 /* 2684 * If reclaim is making forward progress but we're still over 2685 * memory.high, we want to encourage that rather than doing allocator 2686 * throttling. 2687 */ 2688 if (nr_reclaimed || nr_retries--) { 2689 in_retry = true; 2690 goto retry_reclaim; 2691 } 2692 2693 /* 2694 * If we exit early, we're guaranteed to die (since 2695 * schedule_timeout_killable sets TASK_KILLABLE). This means we don't 2696 * need to account for any ill-begotten jiffies to pay them off later. 
2697 */ 2698 psi_memstall_enter(&pflags); 2699 schedule_timeout_killable(penalty_jiffies); 2700 psi_memstall_leave(&pflags); 2701 2702out: 2703 css_put(&memcg->css); 2704} 2705 2706static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, 2707 unsigned int nr_pages) 2708{ 2709 unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages); 2710 int nr_retries = MAX_RECLAIM_RETRIES; 2711 struct mem_cgroup *mem_over_limit; 2712 struct page_counter *counter; 2713 enum oom_status oom_status; 2714 unsigned long nr_reclaimed; 2715 bool passed_oom = false; 2716 bool may_swap = true; 2717 bool drained = false; 2718 unsigned long pflags; 2719 2720 if (mem_cgroup_is_root(memcg)) 2721 return 0; 2722retry: 2723 if (consume_stock(memcg, nr_pages)) 2724 return 0; 2725 2726 if (!do_memsw_account() || 2727 page_counter_try_charge(&memcg->memsw, batch, &counter)) { 2728 if (page_counter_try_charge(&memcg->memory, batch, &counter)) 2729 goto done_restock; 2730 if (do_memsw_account()) 2731 page_counter_uncharge(&memcg->memsw, batch); 2732 mem_over_limit = mem_cgroup_from_counter(counter, memory); 2733 } else { 2734 mem_over_limit = mem_cgroup_from_counter(counter, memsw); 2735 may_swap = false; 2736 } 2737 2738 if (batch > nr_pages) { 2739 batch = nr_pages; 2740 goto retry; 2741 } 2742 2743 /* 2744 * Memcg doesn't have a dedicated reserve for atomic 2745 * allocations. But like the global atomic pool, we need to 2746 * put the burden of reclaim on regular allocation requests 2747 * and let these go through as privileged allocations. 2748 */ 2749 if (gfp_mask & __GFP_ATOMIC) 2750 goto force; 2751 2752 /* 2753 * Prevent unbounded recursion when reclaim operations need to 2754 * allocate memory. This might exceed the limits temporarily, 2755 * but we prefer facilitating memory reclaim and getting back 2756 * under the limit over triggering OOM kills in these cases. 2757 */ 2758 if (unlikely(current->flags & PF_MEMALLOC)) 2759 goto force; 2760 2761 if (unlikely(task_in_memcg_oom(current))) 2762 goto nomem; 2763 2764 if (!gfpflags_allow_blocking(gfp_mask)) 2765 goto nomem; 2766 2767 memcg_memory_event(mem_over_limit, MEMCG_MAX); 2768 2769 psi_memstall_enter(&pflags); 2770 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, 2771 gfp_mask, may_swap); 2772 psi_memstall_leave(&pflags); 2773 2774 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2775 goto retry; 2776 2777 if (!drained) { 2778 drain_all_stock(mem_over_limit); 2779 drained = true; 2780 goto retry; 2781 } 2782 2783 if (gfp_mask & __GFP_NORETRY) 2784 goto nomem; 2785 /* 2786 * Even though the limit is exceeded at this point, reclaim 2787 * may have been able to free some pages. Retry the charge 2788 * before killing the task. 2789 * 2790 * Only for regular pages, though: huge pages are rather 2791 * unlikely to succeed so close to the limit, and we fall back 2792 * to regular pages anyway in case of failure. 2793 */ 2794 if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER)) 2795 goto retry; 2796 /* 2797 * At task move, charge accounts can be doubly counted. So, it's 2798 * better to wait until the end of task_move if something is going on. 
2799 */ 2800 if (mem_cgroup_wait_acct_move(mem_over_limit)) 2801 goto retry; 2802 2803 if (nr_retries--) 2804 goto retry; 2805 2806 if (gfp_mask & __GFP_RETRY_MAYFAIL) 2807 goto nomem; 2808 2809 if (gfp_mask & __GFP_NOFAIL) 2810 goto force; 2811 2812 /* Avoid endless loop for tasks bypassed by the oom killer */ 2813 if (passed_oom && task_is_dying()) 2814 goto nomem; 2815 2816 /* 2817 * Keep retrying as long as the memcg OOM killer is able to make 2818 * forward progress, or bypass the charge if the OOM killer 2819 * couldn't make any progress. 2820 */ 2821 oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask, 2822 get_order(nr_pages * PAGE_SIZE)); 2823 if (oom_status == OOM_SUCCESS) { 2824 passed_oom = true; 2825 nr_retries = MAX_RECLAIM_RETRIES; 2826 goto retry; 2827 } 2828nomem: 2829 if (!(gfp_mask & __GFP_NOFAIL)) 2830 return -ENOMEM; 2831force: 2832 /* 2833 * The allocation either can't fail or will lead to more memory 2834 * being freed very soon. Allow memory usage to go over the limit 2835 * temporarily by force charging it. 2836 */ 2837 page_counter_charge(&memcg->memory, nr_pages); 2838 if (do_memsw_account()) 2839 page_counter_charge(&memcg->memsw, nr_pages); 2840 2841 return 0; 2842 2843done_restock: 2844 if (batch > nr_pages) 2845 refill_stock(memcg, batch - nr_pages); 2846 2847 /* 2848 * If the hierarchy is above the normal consumption range, schedule 2849 * reclaim on returning to userland. We can perform reclaim here 2850 * if __GFP_RECLAIM but let's always punt for simplicity and so that 2851 * GFP_KERNEL can consistently be used during reclaim. @memcg is 2852 * not recorded as it most likely matches current's and won't 2853 * change in the meantime. As the high limit is checked again before 2854 * reclaim, the cost of mismatch is negligible. 2855 */ 2856 do { 2857 bool mem_high, swap_high; 2858 2859 mem_high = page_counter_read(&memcg->memory) > 2860 READ_ONCE(memcg->memory.high); 2861 swap_high = page_counter_read(&memcg->swap) > 2862 READ_ONCE(memcg->swap.high); 2863 2864 /* Don't bother a random interrupted task */ 2865 if (in_interrupt()) { 2866 if (mem_high) { 2867 schedule_work(&memcg->high_work); 2868 break; 2869 } 2870 continue; 2871 } 2872 2873 if (mem_high || swap_high) { 2874 /* 2875 * The allocating tasks in this cgroup will need to do 2876 * reclaim or be throttled to prevent further growth 2877 * of the memory or swap footprints. 2878 * 2879 * Target some best-effort fairness between the tasks, 2880 * and distribute reclaim work and delay penalties 2881 * based on how much each task is actually allocating.
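 *
 * The actual reclaim and throttling work is done later by
 * mem_cgroup_handle_over_high() on the return-to-userspace path;
 * here we only account the charged batch and flag the task with
 * set_notify_resume().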
2882 */ 2883 current->memcg_nr_pages_over_high += batch; 2884 set_notify_resume(current); 2885 break; 2886 } 2887 } while ((memcg = parent_mem_cgroup(memcg))); 2888 2889 return 0; 2890} 2891 2892#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MMU) 2893static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) 2894{ 2895 if (mem_cgroup_is_root(memcg)) 2896 return; 2897 2898 page_counter_uncharge(&memcg->memory, nr_pages); 2899 if (do_memsw_account()) 2900 page_counter_uncharge(&memcg->memsw, nr_pages); 2901} 2902#endif 2903 2904static void commit_charge(struct page *page, struct mem_cgroup *memcg) 2905{ 2906 VM_BUG_ON_PAGE(page->mem_cgroup, page); 2907 /* 2908 * Any of the following ensures page->mem_cgroup stability: 2909 * 2910 * - the page lock 2911 * - LRU isolation 2912 * - lock_page_memcg() 2913 * - exclusive reference 2914 */ 2915 page->mem_cgroup = memcg; 2916} 2917 2918#ifdef CONFIG_MEMCG_KMEM 2919/* 2920 * The allocated objcg pointers array is not accounted directly. 2921 * Moreover, it should not come from DMA buffer and is not readily 2922 * reclaimable. So those GFP bits should be masked off. 2923 */ 2924#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | \ 2925 __GFP_ACCOUNT | __GFP_NOFAIL) 2926 2927int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, 2928 gfp_t gfp) 2929{ 2930 unsigned int objects = objs_per_slab_page(s, page); 2931 void *vec; 2932 2933 gfp &= ~OBJCGS_CLEAR_MASK; 2934 vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp, 2935 page_to_nid(page)); 2936 if (!vec) 2937 return -ENOMEM; 2938 2939 if (cmpxchg(&page->obj_cgroups, NULL, 2940 (struct obj_cgroup **) ((unsigned long)vec | 0x1UL))) 2941 kfree(vec); 2942 else 2943 kmemleak_not_leak(vec); 2944 2945 return 0; 2946} 2947 2948/* 2949 * Returns a pointer to the memory cgroup to which the kernel object is charged. 2950 * 2951 * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(), 2952 * cgroup_mutex, etc. 2953 */ 2954struct mem_cgroup *mem_cgroup_from_obj(void *p) 2955{ 2956 struct page *page; 2957 2958 if (mem_cgroup_disabled()) 2959 return NULL; 2960 2961 page = virt_to_head_page(p); 2962 2963 /* 2964 * If page->mem_cgroup is set, it's either a simple mem_cgroup pointer 2965 * or a pointer to obj_cgroup vector. In the latter case the lowest 2966 * bit of the pointer is set. 2967 * The page->mem_cgroup pointer can be asynchronously changed 2968 * from NULL to (obj_cgroup_vec | 0x1UL), but can't be changed 2969 * from a valid memcg pointer to objcg vector or back. 2970 */ 2971 if (!page->mem_cgroup) 2972 return NULL; 2973 2974 /* 2975 * Slab objects are accounted individually, not per-page. 2976 * Memcg membership data for each individual object is saved in 2977 * the page->obj_cgroups. 
2978 */ 2979 if (page_has_obj_cgroups(page)) { 2980 struct obj_cgroup *objcg; 2981 unsigned int off; 2982 2983 off = obj_to_index(page->slab_cache, page, p); 2984 objcg = page_obj_cgroups(page)[off]; 2985 if (objcg) 2986 return obj_cgroup_memcg(objcg); 2987 2988 return NULL; 2989 } 2990 2991 /* All other pages use page->mem_cgroup */ 2992 return page->mem_cgroup; 2993} 2994 2995__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void) 2996{ 2997 struct obj_cgroup *objcg = NULL; 2998 struct mem_cgroup *memcg; 2999 3000 if (memcg_kmem_bypass()) 3001 return NULL; 3002 3003 rcu_read_lock(); 3004 if (unlikely(active_memcg())) 3005 memcg = active_memcg(); 3006 else 3007 memcg = mem_cgroup_from_task(current); 3008 3009 for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) { 3010 objcg = rcu_dereference(memcg->objcg); 3011 if (objcg && obj_cgroup_tryget(objcg)) 3012 break; 3013 objcg = NULL; 3014 } 3015 rcu_read_unlock(); 3016 3017 return objcg; 3018} 3019 3020static int memcg_alloc_cache_id(void) 3021{ 3022 int id, size; 3023 int err; 3024 3025 id = ida_simple_get(&memcg_cache_ida, 3026 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); 3027 if (id < 0) 3028 return id; 3029 3030 if (id < memcg_nr_cache_ids) 3031 return id; 3032 3033 /* 3034 * There's no space for the new id in memcg_caches arrays, 3035 * so we have to grow them. 3036 */ 3037 down_write(&memcg_cache_ids_sem); 3038 3039 size = 2 * (id + 1); 3040 if (size < MEMCG_CACHES_MIN_SIZE) 3041 size = MEMCG_CACHES_MIN_SIZE; 3042 else if (size > MEMCG_CACHES_MAX_SIZE) 3043 size = MEMCG_CACHES_MAX_SIZE; 3044 3045 err = memcg_update_all_list_lrus(size); 3046 if (!err) 3047 memcg_nr_cache_ids = size; 3048 3049 up_write(&memcg_cache_ids_sem); 3050 3051 if (err) { 3052 ida_simple_remove(&memcg_cache_ida, id); 3053 return err; 3054 } 3055 return id; 3056} 3057 3058static void memcg_free_cache_id(int id) 3059{ 3060 ida_simple_remove(&memcg_cache_ida, id); 3061} 3062 3063/** 3064 * __memcg_kmem_charge: charge a number of kernel pages to a memcg 3065 * @memcg: memory cgroup to charge 3066 * @gfp: reclaim mode 3067 * @nr_pages: number of pages to charge 3068 * 3069 * Returns 0 on success, an error code on failure. 3070 */ 3071int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp, 3072 unsigned int nr_pages) 3073{ 3074 struct page_counter *counter; 3075 int ret; 3076 3077 ret = try_charge(memcg, gfp, nr_pages); 3078 if (ret) 3079 return ret; 3080 3081 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && 3082 !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) { 3083 3084 /* 3085 * Enforce __GFP_NOFAIL allocation because callers are not 3086 * prepared to see failures and likely do not have any failure 3087 * handling code. 
3088 */ 3089 if (gfp & __GFP_NOFAIL) { 3090 page_counter_charge(&memcg->kmem, nr_pages); 3091 return 0; 3092 } 3093 cancel_charge(memcg, nr_pages); 3094 return -ENOMEM; 3095 } 3096 return 0; 3097} 3098 3099/** 3100 * __memcg_kmem_uncharge: uncharge a number of kernel pages from a memcg 3101 * @memcg: memcg to uncharge 3102 * @nr_pages: number of pages to uncharge 3103 */ 3104void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages) 3105{ 3106 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) 3107 page_counter_uncharge(&memcg->kmem, nr_pages); 3108 3109 refill_stock(memcg, nr_pages); 3110} 3111 3112/** 3113 * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup 3114 * @page: page to charge 3115 * @gfp: reclaim mode 3116 * @order: allocation order 3117 * 3118 * Returns 0 on success, an error code on failure. 3119 */ 3120int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) 3121{ 3122 struct mem_cgroup *memcg; 3123 int ret = 0; 3124 3125 memcg = get_mem_cgroup_from_current(); 3126 if (memcg && !mem_cgroup_is_root(memcg)) { 3127 ret = __memcg_kmem_charge(memcg, gfp, 1 << order); 3128 if (!ret) { 3129 page->mem_cgroup = memcg; 3130 __SetPageKmemcg(page); 3131 return 0; 3132 } 3133 css_put(&memcg->css); 3134 } 3135 return ret; 3136} 3137 3138/** 3139 * __memcg_kmem_uncharge_page: uncharge a kmem page 3140 * @page: page to uncharge 3141 * @order: allocation order 3142 */ 3143void __memcg_kmem_uncharge_page(struct page *page, int order) 3144{ 3145 struct mem_cgroup *memcg = page->mem_cgroup; 3146 unsigned int nr_pages = 1 << order; 3147 3148 if (!memcg) 3149 return; 3150 3151 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); 3152 __memcg_kmem_uncharge(memcg, nr_pages); 3153 page->mem_cgroup = NULL; 3154 css_put(&memcg->css); 3155 3156 /* slab pages do not have PageKmemcg flag set */ 3157 if (PageKmemcg(page)) 3158 __ClearPageKmemcg(page); 3159} 3160 3161static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) 3162{ 3163 struct memcg_stock_pcp *stock; 3164 unsigned long flags; 3165 bool ret = false; 3166 3167 local_irq_save(flags); 3168 3169 stock = this_cpu_ptr(&memcg_stock); 3170 if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) { 3171 stock->nr_bytes -= nr_bytes; 3172 ret = true; 3173 } 3174 3175 local_irq_restore(flags); 3176 3177 return ret; 3178} 3179 3180static void drain_obj_stock(struct memcg_stock_pcp *stock) 3181{ 3182 struct obj_cgroup *old = stock->cached_objcg; 3183 3184 if (!old) 3185 return; 3186 3187 if (stock->nr_bytes) { 3188 unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT; 3189 unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1); 3190 3191 if (nr_pages) { 3192 struct mem_cgroup *memcg; 3193 3194 rcu_read_lock(); 3195retry: 3196 memcg = obj_cgroup_memcg(old); 3197 if (unlikely(!css_tryget(&memcg->css))) 3198 goto retry; 3199 rcu_read_unlock(); 3200 3201 __memcg_kmem_uncharge(memcg, nr_pages); 3202 css_put(&memcg->css); 3203 } 3204 3205 /* 3206 * The leftover is flushed to the centralized per-memcg value. 3207 * On the next attempt to refill obj stock it will be moved 3208 * to a per-cpu stock (probably, on an other CPU), see 3209 * refill_obj_stock(). 3210 * 3211 * How often it's flushed is a trade-off between the memory 3212 * limit enforcement accuracy and potential CPU contention, 3213 * so it might be changed in the future. 
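 *
 * For example, with stock->nr_bytes == PAGE_SIZE + 200, one whole
 * page is uncharged from the memcg above and the remaining 200
 * bytes are parked in old->nr_charged_bytes here.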
3214 */ 3215 atomic_add(nr_bytes, &old->nr_charged_bytes); 3216 stock->nr_bytes = 0; 3217 } 3218 3219 obj_cgroup_put(old); 3220 stock->cached_objcg = NULL; 3221} 3222 3223static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, 3224 struct mem_cgroup *root_memcg) 3225{ 3226 struct mem_cgroup *memcg; 3227 3228 if (stock->cached_objcg) { 3229 memcg = obj_cgroup_memcg(stock->cached_objcg); 3230 if (memcg && mem_cgroup_is_descendant(memcg, root_memcg)) 3231 return true; 3232 } 3233 3234 return false; 3235} 3236 3237static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) 3238{ 3239 struct memcg_stock_pcp *stock; 3240 unsigned long flags; 3241 3242 local_irq_save(flags); 3243 3244 stock = this_cpu_ptr(&memcg_stock); 3245 if (stock->cached_objcg != objcg) { /* reset if necessary */ 3246 drain_obj_stock(stock); 3247 obj_cgroup_get(objcg); 3248 stock->cached_objcg = objcg; 3249 stock->nr_bytes = atomic_xchg(&objcg->nr_charged_bytes, 0); 3250 } 3251 stock->nr_bytes += nr_bytes; 3252 3253 if (stock->nr_bytes > PAGE_SIZE) 3254 drain_obj_stock(stock); 3255 3256 local_irq_restore(flags); 3257} 3258 3259int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) 3260{ 3261 struct mem_cgroup *memcg; 3262 unsigned int nr_pages, nr_bytes; 3263 int ret; 3264 3265 if (consume_obj_stock(objcg, size)) 3266 return 0; 3267 3268 /* 3269 * In theory, memcg->nr_charged_bytes can have enough 3270 * pre-charged bytes to satisfy the allocation. However, 3271 * flushing memcg->nr_charged_bytes requires two atomic 3272 * operations, and memcg->nr_charged_bytes can't be big, 3273 * so it's better to ignore it and try grab some new pages. 3274 * memcg->nr_charged_bytes will be flushed in 3275 * refill_obj_stock(), called from this function or 3276 * independently later. 3277 */ 3278 rcu_read_lock(); 3279retry: 3280 memcg = obj_cgroup_memcg(objcg); 3281 if (unlikely(!css_tryget(&memcg->css))) 3282 goto retry; 3283 rcu_read_unlock(); 3284 3285 nr_pages = size >> PAGE_SHIFT; 3286 nr_bytes = size & (PAGE_SIZE - 1); 3287 3288 if (nr_bytes) 3289 nr_pages += 1; 3290 3291 ret = __memcg_kmem_charge(memcg, gfp, nr_pages); 3292 if (!ret && nr_bytes) 3293 refill_obj_stock(objcg, PAGE_SIZE - nr_bytes); 3294 3295 css_put(&memcg->css); 3296 return ret; 3297} 3298 3299void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size) 3300{ 3301 refill_obj_stock(objcg, size); 3302} 3303 3304#endif /* CONFIG_MEMCG_KMEM */ 3305 3306/* 3307 * Because head->mem_cgroup is not set on tails, set it now. 3308 */ 3309void split_page_memcg(struct page *head, unsigned int nr) 3310{ 3311 struct mem_cgroup *memcg = head->mem_cgroup; 3312 int kmemcg = PageKmemcg(head); 3313 int i; 3314 3315 if (mem_cgroup_disabled() || !memcg) 3316 return; 3317 3318 for (i = 1; i < nr; i++) { 3319 head[i].mem_cgroup = memcg; 3320 if (kmemcg) 3321 __SetPageKmemcg(head + i); 3322 } 3323 css_get_many(&memcg->css, nr - 1); 3324} 3325 3326#ifdef CONFIG_MEMCG_SWAP 3327/** 3328 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. 3329 * @entry: swap entry to be moved 3330 * @from: mem_cgroup which the entry is moved from 3331 * @to: mem_cgroup which the entry is moved to 3332 * 3333 * It succeeds only when the swap_cgroup's record for this entry is the same 3334 * as the mem_cgroup's id of @from. 3335 * 3336 * Returns 0 on success, -EINVAL on failure. 3337 * 3338 * The caller must have charged to @to, IOW, called page_counter_charge() about 3339 * both res and memsw, and called css_get(). 
3340 */ 3341static int mem_cgroup_move_swap_account(swp_entry_t entry, 3342 struct mem_cgroup *from, struct mem_cgroup *to) 3343{ 3344 unsigned short old_id, new_id; 3345 3346 old_id = mem_cgroup_id(from); 3347 new_id = mem_cgroup_id(to); 3348 3349 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 3350 mod_memcg_state(from, MEMCG_SWAP, -1); 3351 mod_memcg_state(to, MEMCG_SWAP, 1); 3352 return 0; 3353 } 3354 return -EINVAL; 3355} 3356#else 3357static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 3358 struct mem_cgroup *from, struct mem_cgroup *to) 3359{ 3360 return -EINVAL; 3361} 3362#endif 3363 3364static DEFINE_MUTEX(memcg_max_mutex); 3365 3366static int mem_cgroup_resize_max(struct mem_cgroup *memcg, 3367 unsigned long max, bool memsw) 3368{ 3369 bool enlarge = false; 3370 bool drained = false; 3371 int ret; 3372 bool limits_invariant; 3373 struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory; 3374 3375 do { 3376 if (signal_pending(current)) { 3377 ret = -EINTR; 3378 break; 3379 } 3380 3381 mutex_lock(&memcg_max_mutex); 3382 /* 3383 * Make sure that the new limit (memsw or memory limit) doesn't 3384 * break our basic invariant rule memory.max <= memsw.max. 3385 */ 3386 limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) : 3387 max <= memcg->memsw.max; 3388 if (!limits_invariant) { 3389 mutex_unlock(&memcg_max_mutex); 3390 ret = -EINVAL; 3391 break; 3392 } 3393 if (max > counter->max) 3394 enlarge = true; 3395 ret = page_counter_set_max(counter, max); 3396 mutex_unlock(&memcg_max_mutex); 3397 3398 if (!ret) 3399 break; 3400 3401 if (!drained) { 3402 drain_all_stock(memcg); 3403 drained = true; 3404 continue; 3405 } 3406 3407 if (!try_to_free_mem_cgroup_pages(memcg, 1, 3408 GFP_KERNEL, !memsw)) { 3409 ret = -EBUSY; 3410 break; 3411 } 3412 } while (true); 3413 3414 if (!ret && enlarge) 3415 memcg_oom_recover(memcg); 3416 3417 return ret; 3418} 3419 3420unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, 3421 gfp_t gfp_mask, 3422 unsigned long *total_scanned) 3423{ 3424 unsigned long nr_reclaimed = 0; 3425 struct mem_cgroup_per_node *mz, *next_mz = NULL; 3426 unsigned long reclaimed; 3427 int loop = 0; 3428 struct mem_cgroup_tree_per_node *mctz; 3429 unsigned long excess; 3430 unsigned long nr_scanned; 3431 3432 if (order > 0) 3433 return 0; 3434 3435 mctz = soft_limit_tree_node(pgdat->node_id); 3436 3437 /* 3438 * Do not even bother to check the largest node if the root 3439 * is empty. Do it lockless to prevent lock bouncing. Races 3440 * are acceptable as soft limit is best effort anyway. 
3441 */ 3442 if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root)) 3443 return 0; 3444 3445 /* 3446 * This loop can run for a while, especially if mem_cgroups 3447 * continuously keep exceeding their soft limit and putting the 3448 * system under pressure 3449 */ 3450 do { 3451 if (next_mz) 3452 mz = next_mz; 3453 else 3454 mz = mem_cgroup_largest_soft_limit_node(mctz); 3455 if (!mz) 3456 break; 3457 3458 nr_scanned = 0; 3459 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat, 3460 gfp_mask, &nr_scanned); 3461 nr_reclaimed += reclaimed; 3462 *total_scanned += nr_scanned; 3463 spin_lock_irq(&mctz->lock); 3464 __mem_cgroup_remove_exceeded(mz, mctz); 3465 3466 /* 3467 * If we failed to reclaim anything from this memory cgroup, 3468 * it is time to move on to the next cgroup 3469 */ 3470 next_mz = NULL; 3471 if (!reclaimed) 3472 next_mz = __mem_cgroup_largest_soft_limit_node(mctz); 3473 3474 excess = soft_limit_excess(mz->memcg); 3475 /* 3476 * One school of thought says that we should not add 3477 * back the node to the tree if reclaim returns 0. 3478 * But our reclaim could return 0 simply because, due 3479 * to priority, we are exposing a smaller subset of 3480 * memory to reclaim from. Consider this a 3481 * longer-term TODO. 3482 */ 3483 /* If excess == 0, no tree ops */ 3484 __mem_cgroup_insert_exceeded(mz, mctz, excess); 3485 spin_unlock_irq(&mctz->lock); 3486 css_put(&mz->memcg->css); 3487 loop++; 3488 /* 3489 * We could not reclaim anything and there are no more 3490 * mem cgroups to try, or we seem to be looping without 3491 * reclaiming anything. 3492 */ 3493 if (!nr_reclaimed && 3494 (next_mz == NULL || 3495 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) 3496 break; 3497 } while (!nr_reclaimed); 3498 if (next_mz) 3499 css_put(&next_mz->memcg->css); 3500 return nr_reclaimed; 3501} 3502 3503/* 3504 * Test whether @memcg has children, dead or alive. Note that this 3505 * function doesn't care whether @memcg has use_hierarchy enabled and 3506 * returns %true if there are child csses according to the cgroup 3507 * hierarchy. Testing use_hierarchy is the caller's responsibility. 3508 */ 3509static inline bool memcg_has_children(struct mem_cgroup *memcg) 3510{ 3511 bool ret; 3512 3513 rcu_read_lock(); 3514 ret = css_next_child(NULL, &memcg->css); 3515 rcu_read_unlock(); 3516 return ret; 3517} 3518 3519/* 3520 * Reclaims as many pages from the given memcg as possible. 3521 * 3522 * Caller is responsible for holding css reference for memcg.
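 *
 * In this file it is invoked from mem_cgroup_force_empty_write(),
 * the write handler for the legacy memory.force_empty control file
 * just below.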
3523 */ 3524static int mem_cgroup_force_empty(struct mem_cgroup *memcg) 3525{ 3526 int nr_retries = MAX_RECLAIM_RETRIES; 3527 3528 /* we call try-to-free pages to make this cgroup empty */ 3529 lru_add_drain_all(); 3530 3531 drain_all_stock(memcg); 3532 3533 /* try to free all pages in this cgroup */ 3534 while (nr_retries && page_counter_read(&memcg->memory)) { 3535 int progress; 3536 3537 if (signal_pending(current)) 3538 return -EINTR; 3539 3540 progress = try_to_free_mem_cgroup_pages(memcg, 1, 3541 GFP_KERNEL, true); 3542 if (!progress) { 3543 nr_retries--; 3544 /* maybe some writeback is necessary */ 3545 congestion_wait(BLK_RW_ASYNC, HZ/10); 3546 } 3547 3548 } 3549 3550 return 0; 3551} 3552 3553static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of, 3554 char *buf, size_t nbytes, 3555 loff_t off) 3556{ 3557 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3558 3559 if (mem_cgroup_is_root(memcg)) 3560 return -EINVAL; 3561 return mem_cgroup_force_empty(memcg) ?: nbytes; 3562} 3563 3564static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, 3565 struct cftype *cft) 3566{ 3567 return mem_cgroup_from_css(css)->use_hierarchy; 3568} 3569 3570static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, 3571 struct cftype *cft, u64 val) 3572{ 3573 int retval = 0; 3574 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3575 struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent); 3576 3577 if (memcg->use_hierarchy == val) 3578 return 0; 3579 3580 /* 3581 * If parent's use_hierarchy is set, we can't make any modifications 3582 * in the child subtrees. If it is unset, then the change can 3583 * occur, provided the current cgroup has no children. 3584 * 3585 * For the root cgroup, parent_memcg is NULL; we allow the value to 3586 * be set if there are no children.
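 *
 * For example, flipping use_hierarchy on a cgroup that already has
 * children fails with -EBUSY below, and doing so while the parent
 * has use_hierarchy enabled fails with -EINVAL.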
3587 */ 3588 if ((!parent_memcg || !parent_memcg->use_hierarchy) && 3589 (val == 1 || val == 0)) { 3590 if (!memcg_has_children(memcg)) 3591 memcg->use_hierarchy = val; 3592 else 3593 retval = -EBUSY; 3594 } else 3595 retval = -EINVAL; 3596 3597 return retval; 3598} 3599 3600static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) 3601{ 3602 unsigned long val; 3603 3604 if (mem_cgroup_is_root(memcg)) { 3605 val = memcg_page_state(memcg, NR_FILE_PAGES) + 3606 memcg_page_state(memcg, NR_ANON_MAPPED); 3607 if (swap) 3608 val += memcg_page_state(memcg, MEMCG_SWAP); 3609 } else { 3610 if (!swap) 3611 val = page_counter_read(&memcg->memory); 3612 else 3613 val = page_counter_read(&memcg->memsw); 3614 } 3615 return val; 3616} 3617 3618enum { 3619 RES_USAGE, 3620 RES_LIMIT, 3621 RES_MAX_USAGE, 3622 RES_FAILCNT, 3623 RES_SOFT_LIMIT, 3624}; 3625 3626static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, 3627 struct cftype *cft) 3628{ 3629 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3630 struct page_counter *counter; 3631 3632 switch (MEMFILE_TYPE(cft->private)) { 3633 case _MEM: 3634 counter = &memcg->memory; 3635 break; 3636 case _MEMSWAP: 3637 counter = &memcg->memsw; 3638 break; 3639 case _KMEM: 3640 counter = &memcg->kmem; 3641 break; 3642 case _TCP: 3643 counter = &memcg->tcpmem; 3644 break; 3645 default: 3646 BUG(); 3647 } 3648 3649 switch (MEMFILE_ATTR(cft->private)) { 3650 case RES_USAGE: 3651 if (counter == &memcg->memory) 3652 return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE; 3653 if (counter == &memcg->memsw) 3654 return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE; 3655 return (u64)page_counter_read(counter) * PAGE_SIZE; 3656 case RES_LIMIT: 3657 return (u64)counter->max * PAGE_SIZE; 3658 case RES_MAX_USAGE: 3659 return (u64)counter->watermark * PAGE_SIZE; 3660 case RES_FAILCNT: 3661 return counter->failcnt; 3662 case RES_SOFT_LIMIT: 3663 return (u64)memcg->soft_limit * PAGE_SIZE; 3664 default: 3665 BUG(); 3666 } 3667} 3668 3669static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg) 3670{ 3671 unsigned long stat[MEMCG_NR_STAT] = {0}; 3672 struct mem_cgroup *mi; 3673 int node, cpu, i; 3674 3675 for_each_online_cpu(cpu) 3676 for (i = 0; i < MEMCG_NR_STAT; i++) 3677 stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu); 3678 3679 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) 3680 for (i = 0; i < MEMCG_NR_STAT; i++) 3681 atomic_long_add(stat[i], &mi->vmstats[i]); 3682 3683 for_each_node(node) { 3684 struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; 3685 struct mem_cgroup_per_node *pi; 3686 3687 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) 3688 stat[i] = 0; 3689 3690 for_each_online_cpu(cpu) 3691 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) 3692 stat[i] += per_cpu( 3693 pn->lruvec_stat_cpu->count[i], cpu); 3694 3695 for (pi = pn; pi; pi = parent_nodeinfo(pi, node)) 3696 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) 3697 atomic_long_add(stat[i], &pi->lruvec_stat[i]); 3698 } 3699} 3700 3701static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg) 3702{ 3703 unsigned long events[NR_VM_EVENT_ITEMS]; 3704 struct mem_cgroup *mi; 3705 int cpu, i; 3706 3707 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) 3708 events[i] = 0; 3709 3710 for_each_online_cpu(cpu) 3711 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) 3712 events[i] += per_cpu(memcg->vmstats_percpu->events[i], 3713 cpu); 3714 3715 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) 3716 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) 3717 atomic_long_add(events[i], &mi->vmevents[i]); 3718} 3719 
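/*
 * The two flush helpers above fold each per-CPU counter of a memcg into
 * the atomic vmstats/vmevents of that memcg and of all its ancestors.
 */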
3720#ifdef CONFIG_MEMCG_KMEM 3721static int memcg_online_kmem(struct mem_cgroup *memcg) 3722{ 3723 struct obj_cgroup *objcg; 3724 int memcg_id; 3725 3726 if (cgroup_memory_nokmem) 3727 return 0; 3728 3729 BUG_ON(memcg->kmemcg_id >= 0); 3730 BUG_ON(memcg->kmem_state); 3731 3732 memcg_id = memcg_alloc_cache_id(); 3733 if (memcg_id < 0) 3734 return memcg_id; 3735 3736 objcg = obj_cgroup_alloc(); 3737 if (!objcg) { 3738 memcg_free_cache_id(memcg_id); 3739 return -ENOMEM; 3740 } 3741 objcg->memcg = memcg; 3742 rcu_assign_pointer(memcg->objcg, objcg); 3743 3744 static_branch_enable(&memcg_kmem_enabled_key); 3745 3746 /* 3747 * A memory cgroup is considered kmem-online as soon as it gets 3748 * kmemcg_id. Setting the id after enabling static branching will 3749 * guarantee no one starts accounting before all call sites are 3750 * patched. 3751 */ 3752 memcg->kmemcg_id = memcg_id; 3753 memcg->kmem_state = KMEM_ONLINE; 3754 3755 return 0; 3756} 3757 3758static void memcg_offline_kmem(struct mem_cgroup *memcg) 3759{ 3760 struct cgroup_subsys_state *css; 3761 struct mem_cgroup *parent, *child; 3762 int kmemcg_id; 3763 3764 if (memcg->kmem_state != KMEM_ONLINE) 3765 return; 3766 3767 memcg->kmem_state = KMEM_ALLOCATED; 3768 3769 parent = parent_mem_cgroup(memcg); 3770 if (!parent) 3771 parent = root_mem_cgroup; 3772 3773 memcg_reparent_objcgs(memcg, parent); 3774 3775 kmemcg_id = memcg->kmemcg_id; 3776 BUG_ON(kmemcg_id < 0); 3777 3778 /* 3779 * Change kmemcg_id of this cgroup and all its descendants to the 3780 * parent's id, and then move all entries from this cgroup's list_lrus 3781 * to ones of the parent. After we have finished, all list_lrus 3782 * corresponding to this cgroup are guaranteed to remain empty. The 3783 * ordering is imposed by list_lru_node->lock taken by 3784 * memcg_drain_all_list_lrus(). 3785 */ 3786 rcu_read_lock(); /* can be called from css_free w/o cgroup_mutex */ 3787 css_for_each_descendant_pre(css, &memcg->css) { 3788 child = mem_cgroup_from_css(css); 3789 BUG_ON(child->kmemcg_id != kmemcg_id); 3790 child->kmemcg_id = parent->kmemcg_id; 3791 if (!memcg->use_hierarchy) 3792 break; 3793 } 3794 rcu_read_unlock(); 3795 3796 memcg_drain_all_list_lrus(kmemcg_id, parent); 3797 3798 memcg_free_cache_id(kmemcg_id); 3799} 3800 3801static void memcg_free_kmem(struct mem_cgroup *memcg) 3802{ 3803 /* css_alloc() failed, offlining didn't happen */ 3804 if (unlikely(memcg->kmem_state == KMEM_ONLINE)) 3805 memcg_offline_kmem(memcg); 3806} 3807#else 3808static int memcg_online_kmem(struct mem_cgroup *memcg) 3809{ 3810 return 0; 3811} 3812static void memcg_offline_kmem(struct mem_cgroup *memcg) 3813{ 3814} 3815static void memcg_free_kmem(struct mem_cgroup *memcg) 3816{ 3817} 3818#endif /* CONFIG_MEMCG_KMEM */ 3819 3820static int memcg_update_kmem_max(struct mem_cgroup *memcg, 3821 unsigned long max) 3822{ 3823 int ret; 3824 3825 mutex_lock(&memcg_max_mutex); 3826 ret = page_counter_set_max(&memcg->kmem, max); 3827 mutex_unlock(&memcg_max_mutex); 3828 return ret; 3829} 3830 3831static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max) 3832{ 3833 int ret; 3834 3835 mutex_lock(&memcg_max_mutex); 3836 3837 ret = page_counter_set_max(&memcg->tcpmem, max); 3838 if (ret) 3839 goto out; 3840 3841 if (!memcg->tcpmem_active) { 3842 /* 3843 * The active flag needs to be written after the static_key 3844 * update. This is what guarantees that the socket activation 3845 * function is the last one to run. 
See mem_cgroup_sk_alloc() 3846 * for details, and note that we don't mark any socket as 3847 * belonging to this memcg until that flag is up. 3848 * 3849 * We need to do this, because static_keys will span multiple 3850 * sites, but we can't control their order. If we mark a socket 3851 * as accounted, but the accounting functions are not patched in 3852 * yet, we'll lose accounting. 3853 * 3854 * We never race with the readers in mem_cgroup_sk_alloc(), 3855 * because when this value change, the code to process it is not 3856 * patched in yet. 3857 */ 3858 static_branch_inc(&memcg_sockets_enabled_key); 3859 memcg->tcpmem_active = true; 3860 } 3861out: 3862 mutex_unlock(&memcg_max_mutex); 3863 return ret; 3864} 3865 3866/* 3867 * The user of this function is... 3868 * RES_LIMIT. 3869 */ 3870static ssize_t mem_cgroup_write(struct kernfs_open_file *of, 3871 char *buf, size_t nbytes, loff_t off) 3872{ 3873 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3874 unsigned long nr_pages; 3875 int ret; 3876 3877 buf = strstrip(buf); 3878 ret = page_counter_memparse(buf, "-1", &nr_pages); 3879 if (ret) 3880 return ret; 3881 3882 switch (MEMFILE_ATTR(of_cft(of)->private)) { 3883 case RES_LIMIT: 3884 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 3885 ret = -EINVAL; 3886 break; 3887 } 3888 switch (MEMFILE_TYPE(of_cft(of)->private)) { 3889 case _MEM: 3890 ret = mem_cgroup_resize_max(memcg, nr_pages, false); 3891 break; 3892 case _MEMSWAP: 3893 ret = mem_cgroup_resize_max(memcg, nr_pages, true); 3894 break; 3895 case _KMEM: 3896 pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. " 3897 "Please report your usecase to linux-mm@kvack.org if you " 3898 "depend on this functionality.\n"); 3899 ret = memcg_update_kmem_max(memcg, nr_pages); 3900 break; 3901 case _TCP: 3902 ret = memcg_update_tcp_max(memcg, nr_pages); 3903 break; 3904 } 3905 break; 3906 case RES_SOFT_LIMIT: 3907 memcg->soft_limit = nr_pages; 3908 ret = 0; 3909 break; 3910 } 3911 return ret ?: nbytes; 3912} 3913 3914static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, 3915 size_t nbytes, loff_t off) 3916{ 3917 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3918 struct page_counter *counter; 3919 3920 switch (MEMFILE_TYPE(of_cft(of)->private)) { 3921 case _MEM: 3922 counter = &memcg->memory; 3923 break; 3924 case _MEMSWAP: 3925 counter = &memcg->memsw; 3926 break; 3927 case _KMEM: 3928 counter = &memcg->kmem; 3929 break; 3930 case _TCP: 3931 counter = &memcg->tcpmem; 3932 break; 3933 default: 3934 BUG(); 3935 } 3936 3937 switch (MEMFILE_ATTR(of_cft(of)->private)) { 3938 case RES_MAX_USAGE: 3939 page_counter_reset_watermark(counter); 3940 break; 3941 case RES_FAILCNT: 3942 counter->failcnt = 0; 3943 break; 3944 default: 3945 BUG(); 3946 } 3947 3948 return nbytes; 3949} 3950 3951static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, 3952 struct cftype *cft) 3953{ 3954 return mem_cgroup_from_css(css)->move_charge_at_immigrate; 3955} 3956 3957#ifdef CONFIG_MMU 3958static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 3959 struct cftype *cft, u64 val) 3960{ 3961 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3962 3963 pr_warn_once("Cgroup memory moving (move_charge_at_immigrate) is deprecated. 
" 3964 "Please report your usecase to linux-mm@kvack.org if you " 3965 "depend on this functionality.\n"); 3966 3967 if (val & ~MOVE_MASK) 3968 return -EINVAL; 3969 3970 /* 3971 * No kind of locking is needed in here, because ->can_attach() will 3972 * check this value once in the beginning of the process, and then carry 3973 * on with stale data. This means that changes to this value will only 3974 * affect task migrations starting after the change. 3975 */ 3976 memcg->move_charge_at_immigrate = val; 3977 return 0; 3978} 3979#else 3980static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 3981 struct cftype *cft, u64 val) 3982{ 3983 return -ENOSYS; 3984} 3985#endif 3986 3987#ifdef CONFIG_NUMA 3988 3989#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)) 3990#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON)) 3991#define LRU_ALL ((1 << NR_LRU_LISTS) - 1) 3992 3993static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, 3994 int nid, unsigned int lru_mask, bool tree) 3995{ 3996 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); 3997 unsigned long nr = 0; 3998 enum lru_list lru; 3999 4000 VM_BUG_ON((unsigned)nid >= nr_node_ids); 4001 4002 for_each_lru(lru) { 4003 if (!(BIT(lru) & lru_mask)) 4004 continue; 4005 if (tree) 4006 nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru); 4007 else 4008 nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru); 4009 } 4010 return nr; 4011} 4012 4013static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, 4014 unsigned int lru_mask, 4015 bool tree) 4016{ 4017 unsigned long nr = 0; 4018 enum lru_list lru; 4019 4020 for_each_lru(lru) { 4021 if (!(BIT(lru) & lru_mask)) 4022 continue; 4023 if (tree) 4024 nr += memcg_page_state(memcg, NR_LRU_BASE + lru); 4025 else 4026 nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru); 4027 } 4028 return nr; 4029} 4030 4031static int memcg_numa_stat_show(struct seq_file *m, void *v) 4032{ 4033 struct numa_stat { 4034 const char *name; 4035 unsigned int lru_mask; 4036 }; 4037 4038 static const struct numa_stat stats[] = { 4039 { "total", LRU_ALL }, 4040 { "file", LRU_ALL_FILE }, 4041 { "anon", LRU_ALL_ANON }, 4042 { "unevictable", BIT(LRU_UNEVICTABLE) }, 4043 }; 4044 const struct numa_stat *stat; 4045 int nid; 4046 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 4047 4048 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 4049 seq_printf(m, "%s=%lu", stat->name, 4050 mem_cgroup_nr_lru_pages(memcg, stat->lru_mask, 4051 false)); 4052 for_each_node_state(nid, N_MEMORY) 4053 seq_printf(m, " N%d=%lu", nid, 4054 mem_cgroup_node_nr_lru_pages(memcg, nid, 4055 stat->lru_mask, false)); 4056 seq_putc(m, '\n'); 4057 } 4058 4059 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 4060 4061 seq_printf(m, "hierarchical_%s=%lu", stat->name, 4062 mem_cgroup_nr_lru_pages(memcg, stat->lru_mask, 4063 true)); 4064 for_each_node_state(nid, N_MEMORY) 4065 seq_printf(m, " N%d=%lu", nid, 4066 mem_cgroup_node_nr_lru_pages(memcg, nid, 4067 stat->lru_mask, true)); 4068 seq_putc(m, '\n'); 4069 } 4070 4071 return 0; 4072} 4073#endif /* CONFIG_NUMA */ 4074 4075static const unsigned int memcg1_stats[] = { 4076 NR_FILE_PAGES, 4077 NR_ANON_MAPPED, 4078#ifdef CONFIG_TRANSPARENT_HUGEPAGE 4079 NR_ANON_THPS, 4080#endif 4081 NR_SHMEM, 4082 NR_FILE_MAPPED, 4083 NR_FILE_DIRTY, 4084 NR_WRITEBACK, 4085 MEMCG_SWAP, 4086}; 4087 4088static const char *const memcg1_stat_names[] = { 4089 "cache", 4090 "rss", 4091#ifdef CONFIG_TRANSPARENT_HUGEPAGE 
4092 "rss_huge", 4093#endif 4094 "shmem", 4095 "mapped_file", 4096 "dirty", 4097 "writeback", 4098 "swap", 4099}; 4100 4101/* Universal VM events cgroup1 shows, original sort order */ 4102static const unsigned int memcg1_events[] = { 4103 PGPGIN, 4104 PGPGOUT, 4105 PGFAULT, 4106 PGMAJFAULT, 4107}; 4108 4109static int memcg_stat_show(struct seq_file *m, void *v) 4110{ 4111 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 4112 unsigned long memory, memsw; 4113 struct mem_cgroup *mi; 4114 unsigned int i; 4115 4116 BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); 4117 4118 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { 4119 unsigned long nr; 4120 4121 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) 4122 continue; 4123 nr = memcg_page_state_local(memcg, memcg1_stats[i]); 4124#ifdef CONFIG_TRANSPARENT_HUGEPAGE 4125 if (memcg1_stats[i] == NR_ANON_THPS) 4126 nr *= HPAGE_PMD_NR; 4127#endif 4128 seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE); 4129 } 4130 4131 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) 4132 seq_printf(m, "%s %lu\n", vm_event_name(memcg1_events[i]), 4133 memcg_events_local(memcg, memcg1_events[i])); 4134 4135 for (i = 0; i < NR_LRU_LISTS; i++) { 4136#ifdef CONFIG_MEM_PURGEABLE 4137 if (i == LRU_INACTIVE_PURGEABLE || i == LRU_ACTIVE_PURGEABLE) 4138 continue; 4139#endif 4140 seq_printf(m, "%s %lu\n", lru_list_name(i), 4141 memcg_page_state_local(memcg, NR_LRU_BASE + i) * 4142 PAGE_SIZE); 4143 } 4144 4145 /* Hierarchical information */ 4146 memory = memsw = PAGE_COUNTER_MAX; 4147 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) { 4148 memory = min(memory, READ_ONCE(mi->memory.max)); 4149 memsw = min(memsw, READ_ONCE(mi->memsw.max)); 4150 } 4151 seq_printf(m, "hierarchical_memory_limit %llu\n", 4152 (u64)memory * PAGE_SIZE); 4153 if (do_memsw_account()) 4154 seq_printf(m, "hierarchical_memsw_limit %llu\n", 4155 (u64)memsw * PAGE_SIZE); 4156 4157 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { 4158 unsigned long nr; 4159 4160 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) 4161 continue; 4162 nr = memcg_page_state(memcg, memcg1_stats[i]); 4163#ifdef CONFIG_TRANSPARENT_HUGEPAGE 4164 if (memcg1_stats[i] == NR_ANON_THPS) 4165 nr *= HPAGE_PMD_NR; 4166#endif 4167 seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], 4168 (u64)nr * PAGE_SIZE); 4169 } 4170 4171 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) 4172 seq_printf(m, "total_%s %llu\n", 4173 vm_event_name(memcg1_events[i]), 4174 (u64)memcg_events(memcg, memcg1_events[i])); 4175 4176 for (i = 0; i < NR_LRU_LISTS; i++) { 4177#ifdef CONFIG_MEM_PURGEABLE 4178 if (i == LRU_INACTIVE_PURGEABLE || i == LRU_ACTIVE_PURGEABLE) 4179 continue; 4180#endif 4181 seq_printf(m, "total_%s %llu\n", lru_list_name(i), 4182 (u64)memcg_page_state(memcg, NR_LRU_BASE + i) * 4183 PAGE_SIZE); 4184 } 4185 4186#ifdef CONFIG_DEBUG_VM 4187 { 4188 pg_data_t *pgdat; 4189 struct mem_cgroup_per_node *mz; 4190 unsigned long anon_cost = 0; 4191 unsigned long file_cost = 0; 4192 4193 for_each_online_pgdat(pgdat) { 4194 mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id); 4195 4196 anon_cost += mz->lruvec.anon_cost; 4197 file_cost += mz->lruvec.file_cost; 4198 } 4199 seq_printf(m, "anon_cost %lu\n", anon_cost); 4200 seq_printf(m, "file_cost %lu\n", file_cost); 4201 } 4202#endif 4203 4204#ifdef CONFIG_HYPERHOLD_DEBUG 4205 memcg_eswap_info_show(m); 4206#endif 4207 return 0; 4208} 4209 4210static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css, 4211 struct cftype *cft) 4212{ 4213 struct mem_cgroup 
*memcg = mem_cgroup_from_css(css); 4214 4215 return mem_cgroup_swappiness(memcg); 4216} 4217 4218static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, 4219 struct cftype *cft, u64 val) 4220{ 4221 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4222 4223 if (val > 200) 4224 return -EINVAL; 4225 4226 if (css->parent) 4227 memcg->swappiness = val; 4228 else 4229 vm_swappiness = val; 4230 4231 return 0; 4232} 4233 4234static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 4235{ 4236 struct mem_cgroup_threshold_ary *t; 4237 unsigned long usage; 4238 int i; 4239 4240 rcu_read_lock(); 4241 if (!swap) 4242 t = rcu_dereference(memcg->thresholds.primary); 4243 else 4244 t = rcu_dereference(memcg->memsw_thresholds.primary); 4245 4246 if (!t) 4247 goto unlock; 4248 4249 usage = mem_cgroup_usage(memcg, swap); 4250 4251 /* 4252 * current_threshold points to threshold just below or equal to usage. 4253 * If it's not true, a threshold was crossed after last 4254 * call of __mem_cgroup_threshold(). 4255 */ 4256 i = t->current_threshold; 4257 4258 /* 4259 * Iterate backward over array of thresholds starting from 4260 * current_threshold and check if a threshold is crossed. 4261 * If none of thresholds below usage is crossed, we read 4262 * only one element of the array here. 4263 */ 4264 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 4265 eventfd_signal(t->entries[i].eventfd, 1); 4266 4267 /* i = current_threshold + 1 */ 4268 i++; 4269 4270 /* 4271 * Iterate forward over array of thresholds starting from 4272 * current_threshold+1 and check if a threshold is crossed. 4273 * If none of thresholds above usage is crossed, we read 4274 * only one element of the array here. 4275 */ 4276 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) 4277 eventfd_signal(t->entries[i].eventfd, 1); 4278 4279 /* Update current_threshold */ 4280 t->current_threshold = i - 1; 4281unlock: 4282 rcu_read_unlock(); 4283} 4284 4285static void mem_cgroup_threshold(struct mem_cgroup *memcg) 4286{ 4287 while (memcg) { 4288 __mem_cgroup_threshold(memcg, false); 4289 if (do_memsw_account()) 4290 __mem_cgroup_threshold(memcg, true); 4291 4292 memcg = parent_mem_cgroup(memcg); 4293 } 4294} 4295 4296static int compare_thresholds(const void *a, const void *b) 4297{ 4298 const struct mem_cgroup_threshold *_a = a; 4299 const struct mem_cgroup_threshold *_b = b; 4300 4301 if (_a->threshold > _b->threshold) 4302 return 1; 4303 4304 if (_a->threshold < _b->threshold) 4305 return -1; 4306 4307 return 0; 4308} 4309 4310static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) 4311{ 4312 struct mem_cgroup_eventfd_list *ev; 4313 4314 spin_lock(&memcg_oom_lock); 4315 4316 list_for_each_entry(ev, &memcg->oom_notify, list) 4317 eventfd_signal(ev->eventfd, 1); 4318 4319 spin_unlock(&memcg_oom_lock); 4320 return 0; 4321} 4322 4323static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) 4324{ 4325 struct mem_cgroup *iter; 4326 4327 for_each_mem_cgroup_tree(iter, memcg) 4328 mem_cgroup_oom_notify_cb(iter); 4329} 4330 4331static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 4332 struct eventfd_ctx *eventfd, const char *args, enum res_type type) 4333{ 4334 struct mem_cgroup_thresholds *thresholds; 4335 struct mem_cgroup_threshold_ary *new; 4336 unsigned long threshold; 4337 unsigned long usage; 4338 int i, size, ret; 4339 4340 ret = page_counter_memparse(args, "-1", &threshold); 4341 if (ret) 4342 return ret; 4343 4344 mutex_lock(&memcg->thresholds_lock); 4345 
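	/*
	 * Select the threshold set that matches the resource type and sample
	 * the current usage under thresholds_lock, so that the sorted array
	 * installed below stays consistent with the usage it was built from.
	 */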
4346 if (type == _MEM) { 4347 thresholds = &memcg->thresholds; 4348 usage = mem_cgroup_usage(memcg, false); 4349 } else if (type == _MEMSWAP) { 4350 thresholds = &memcg->memsw_thresholds; 4351 usage = mem_cgroup_usage(memcg, true); 4352 } else 4353 BUG(); 4354 4355 /* Check if a threshold crossed before adding a new one */ 4356 if (thresholds->primary) 4357 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4358 4359 size = thresholds->primary ? thresholds->primary->size + 1 : 1; 4360 4361 /* Allocate memory for new array of thresholds */ 4362 new = kmalloc(struct_size(new, entries, size), GFP_KERNEL); 4363 if (!new) { 4364 ret = -ENOMEM; 4365 goto unlock; 4366 } 4367 new->size = size; 4368 4369 /* Copy thresholds (if any) to new array */ 4370 if (thresholds->primary) 4371 memcpy(new->entries, thresholds->primary->entries, 4372 flex_array_size(new, entries, size - 1)); 4373 4374 /* Add new threshold */ 4375 new->entries[size - 1].eventfd = eventfd; 4376 new->entries[size - 1].threshold = threshold; 4377 4378 /* Sort thresholds. Registering of new threshold isn't time-critical */ 4379 sort(new->entries, size, sizeof(*new->entries), 4380 compare_thresholds, NULL); 4381 4382 /* Find current threshold */ 4383 new->current_threshold = -1; 4384 for (i = 0; i < size; i++) { 4385 if (new->entries[i].threshold <= usage) { 4386 /* 4387 * new->current_threshold will not be used until 4388 * rcu_assign_pointer(), so it's safe to increment 4389 * it here. 4390 */ 4391 ++new->current_threshold; 4392 } else 4393 break; 4394 } 4395 4396 /* Free old spare buffer and save old primary buffer as spare */ 4397 kfree(thresholds->spare); 4398 thresholds->spare = thresholds->primary; 4399 4400 rcu_assign_pointer(thresholds->primary, new); 4401 4402 /* To be sure that nobody uses thresholds */ 4403 synchronize_rcu(); 4404 4405unlock: 4406 mutex_unlock(&memcg->thresholds_lock); 4407 4408 return ret; 4409} 4410 4411static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 4412 struct eventfd_ctx *eventfd, const char *args) 4413{ 4414 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); 4415} 4416 4417static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, 4418 struct eventfd_ctx *eventfd, const char *args) 4419{ 4420 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); 4421} 4422 4423static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 4424 struct eventfd_ctx *eventfd, enum res_type type) 4425{ 4426 struct mem_cgroup_thresholds *thresholds; 4427 struct mem_cgroup_threshold_ary *new; 4428 unsigned long usage; 4429 int i, j, size, entries; 4430 4431 mutex_lock(&memcg->thresholds_lock); 4432 4433 if (type == _MEM) { 4434 thresholds = &memcg->thresholds; 4435 usage = mem_cgroup_usage(memcg, false); 4436 } else if (type == _MEMSWAP) { 4437 thresholds = &memcg->memsw_thresholds; 4438 usage = mem_cgroup_usage(memcg, true); 4439 } else 4440 BUG(); 4441 4442 if (!thresholds->primary) 4443 goto unlock; 4444 4445 /* Check if a threshold crossed before removing */ 4446 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4447 4448 /* Calculate new number of threshold */ 4449 size = entries = 0; 4450 for (i = 0; i < thresholds->primary->size; i++) { 4451 if (thresholds->primary->entries[i].eventfd != eventfd) 4452 size++; 4453 else 4454 entries++; 4455 } 4456 4457 new = thresholds->spare; 4458 4459 /* If no items related to eventfd have been cleared, nothing to do */ 4460 if (!entries) 4461 goto unlock; 4462 4463 /* Set thresholds array to NULL if we 
don't have thresholds */ 4464 if (!size) { 4465 kfree(new); 4466 new = NULL; 4467 goto swap_buffers; 4468 } 4469 4470 new->size = size; 4471 4472 /* Copy thresholds and find current threshold */ 4473 new->current_threshold = -1; 4474 for (i = 0, j = 0; i < thresholds->primary->size; i++) { 4475 if (thresholds->primary->entries[i].eventfd == eventfd) 4476 continue; 4477 4478 new->entries[j] = thresholds->primary->entries[i]; 4479 if (new->entries[j].threshold <= usage) { 4480 /* 4481 * new->current_threshold will not be used 4482 * until rcu_assign_pointer(), so it's safe to increment 4483 * it here. 4484 */ 4485 ++new->current_threshold; 4486 } 4487 j++; 4488 } 4489 4490swap_buffers: 4491 /* Swap primary and spare array */ 4492 thresholds->spare = thresholds->primary; 4493 4494 rcu_assign_pointer(thresholds->primary, new); 4495 4496 /* To be sure that nobody uses thresholds */ 4497 synchronize_rcu(); 4498 4499 /* If all events are unregistered, free the spare array */ 4500 if (!new) { 4501 kfree(thresholds->spare); 4502 thresholds->spare = NULL; 4503 } 4504unlock: 4505 mutex_unlock(&memcg->thresholds_lock); 4506} 4507 4508static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 4509 struct eventfd_ctx *eventfd) 4510{ 4511 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); 4512} 4513 4514static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 4515 struct eventfd_ctx *eventfd) 4516{ 4517 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); 4518} 4519 4520static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, 4521 struct eventfd_ctx *eventfd, const char *args) 4522{ 4523 struct mem_cgroup_eventfd_list *event; 4524 4525 event = kmalloc(sizeof(*event), GFP_KERNEL); 4526 if (!event) 4527 return -ENOMEM; 4528 4529 spin_lock(&memcg_oom_lock); 4530 4531 event->eventfd = eventfd; 4532 list_add(&event->list, &memcg->oom_notify); 4533 4534 /* already in OOM ? 
*/ 4535 if (memcg->under_oom) 4536 eventfd_signal(eventfd, 1); 4537 spin_unlock(&memcg_oom_lock); 4538 4539 return 0; 4540} 4541 4542static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, 4543 struct eventfd_ctx *eventfd) 4544{ 4545 struct mem_cgroup_eventfd_list *ev, *tmp; 4546 4547 spin_lock(&memcg_oom_lock); 4548 4549 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { 4550 if (ev->eventfd == eventfd) { 4551 list_del(&ev->list); 4552 kfree(ev); 4553 } 4554 } 4555 4556 spin_unlock(&memcg_oom_lock); 4557} 4558 4559static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) 4560{ 4561 struct mem_cgroup *memcg = mem_cgroup_from_seq(sf); 4562 4563 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); 4564 seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); 4565 seq_printf(sf, "oom_kill %lu\n", 4566 atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL])); 4567 return 0; 4568} 4569 4570static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, 4571 struct cftype *cft, u64 val) 4572{ 4573 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4574 4575 /* cannot set to root cgroup and only 0 and 1 are allowed */ 4576 if (!css->parent || !((val == 0) || (val == 1))) 4577 return -EINVAL; 4578 4579 memcg->oom_kill_disable = val; 4580 if (!val) 4581 memcg_oom_recover(memcg); 4582 4583 return 0; 4584} 4585 4586#ifdef CONFIG_CGROUP_WRITEBACK 4587 4588#include <trace/events/writeback.h> 4589 4590static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) 4591{ 4592 return wb_domain_init(&memcg->cgwb_domain, gfp); 4593} 4594 4595static void memcg_wb_domain_exit(struct mem_cgroup *memcg) 4596{ 4597 wb_domain_exit(&memcg->cgwb_domain); 4598} 4599 4600static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) 4601{ 4602 wb_domain_size_changed(&memcg->cgwb_domain); 4603} 4604 4605struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) 4606{ 4607 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); 4608 4609 if (!memcg->css.parent) 4610 return NULL; 4611 4612 return &memcg->cgwb_domain; 4613} 4614 4615/* 4616 * idx can be of type enum memcg_stat_item or node_stat_item. 4617 * Keep in sync with memcg_exact_page(). 4618 */ 4619static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx) 4620{ 4621 long x = atomic_long_read(&memcg->vmstats[idx]); 4622 int cpu; 4623 4624 for_each_online_cpu(cpu) 4625 x += per_cpu_ptr(memcg->vmstats_percpu, cpu)->stat[idx]; 4626 if (x < 0) 4627 x = 0; 4628 return x; 4629} 4630 4631/** 4632 * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg 4633 * @wb: bdi_writeback in question 4634 * @pfilepages: out parameter for number of file pages 4635 * @pheadroom: out parameter for number of allocatable pages according to memcg 4636 * @pdirty: out parameter for number of dirty pages 4637 * @pwriteback: out parameter for number of pages under writeback 4638 * 4639 * Determine the numbers of file, headroom, dirty, and writeback pages in 4640 * @wb's memcg. File, dirty and writeback are self-explanatory. Headroom 4641 * is a bit more involved. 4642 * 4643 * A memcg's headroom is "min(max, high) - used". In the hierarchy, the 4644 * headroom is calculated as the lowest headroom of itself and the 4645 * ancestors. Note that this doesn't consider the actual amount of 4646 * available memory in the system. The caller should further cap 4647 * *@pheadroom accordingly. 
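 *
 * For example (purely illustrative numbers): with max = 1G, high = 512M and
 * 300M used, the local headroom is 212M; if some ancestor is left with only
 * 100M of headroom, *@pheadroom is reported as 100M.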
4648 */ 4649void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, 4650 unsigned long *pheadroom, unsigned long *pdirty, 4651 unsigned long *pwriteback) 4652{ 4653 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); 4654 struct mem_cgroup *parent; 4655 4656 *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY); 4657 4658 *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK); 4659 *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) + 4660 memcg_exact_page_state(memcg, NR_ACTIVE_FILE); 4661 *pheadroom = PAGE_COUNTER_MAX; 4662 4663 while ((parent = parent_mem_cgroup(memcg))) { 4664 unsigned long ceiling = min(READ_ONCE(memcg->memory.max), 4665 READ_ONCE(memcg->memory.high)); 4666 unsigned long used = page_counter_read(&memcg->memory); 4667 4668 *pheadroom = min(*pheadroom, ceiling - min(ceiling, used)); 4669 memcg = parent; 4670 } 4671} 4672 4673/* 4674 * Foreign dirty flushing 4675 * 4676 * There's an inherent mismatch between memcg and writeback. The former 4677 * tracks ownership per-page while the latter per-inode. This was a 4678 * deliberate design decision because honoring per-page ownership in the 4679 * writeback path is complicated, may lead to higher CPU and IO overheads 4680 * and was deemed unnecessary given that write-sharing an inode across 4681 * different cgroups isn't a common use-case. 4682 * 4683 * Combined with inode majority-writer ownership switching, this works well 4684 * enough in most cases but there are some pathological cases. For 4685 * example, let's say there are two cgroups A and B which keep writing to 4686 * different but confined parts of the same inode. B owns the inode and 4687 * A's memory is limited far below B's. A's dirty ratio can rise enough to 4688 * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid 4689 * triggering background writeback. A will be slowed down without a way to 4690 * make writeback of the dirty pages happen. 4691 * 4692 * Conditions like the above can lead to a cgroup getting repeatedly and 4693 * severely throttled after making some progress after each 4694 * dirty_expire_interval while the underlying IO device is almost 4695 * completely idle. 4696 * 4697 * Solving this problem completely requires matching the ownership tracking 4698 * granularities between memcg and writeback in either direction. However, 4699 * the more egregious behaviors can be avoided by simply remembering the 4700 * most recent foreign dirtying events and initiating remote flushes on 4701 * them when local writeback isn't enough to keep the memory clean enough. 4702 * 4703 * The following two functions implement such a mechanism. When a foreign 4704 * page - a page whose memcg and writeback ownerships don't match - is 4705 * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning 4706 * bdi_writeback on the page owning memcg. When balance_dirty_pages() 4707 * decides that the memcg needs to sleep due to high dirty ratio, it calls 4708 * mem_cgroup_flush_foreign() which queues writeback on the recorded 4709 * foreign bdi_writebacks which haven't expired. Both the numbers of 4710 * recorded bdi_writebacks and concurrent in-flight foreign writebacks are 4711 * limited to MEMCG_CGWB_FRN_CNT. 4712 * 4713 * The mechanism only remembers IDs and doesn't hold any object references. 4714 * As being wrong occasionally doesn't matter, updates and accesses to the 4715 * records are lockless and racy. 
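 *
 * In the A/B scenario above, the pages A dirties are charged to A while the
 * inode's writeback is owned by B, so the events are recorded in A's
 * cgwb_frn[] slots; once A gets throttled in balance_dirty_pages(),
 * writeback on B's bdi_writeback is kicked via cgroup_writeback_by_id().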
4716 */ 4717void mem_cgroup_track_foreign_dirty_slowpath(struct page *page, 4718 struct bdi_writeback *wb) 4719{ 4720 struct mem_cgroup *memcg = page->mem_cgroup; 4721 struct memcg_cgwb_frn *frn; 4722 u64 now = get_jiffies_64(); 4723 u64 oldest_at = now; 4724 int oldest = -1; 4725 int i; 4726 4727 trace_track_foreign_dirty(page, wb); 4728 4729 /* 4730 * Pick the slot to use. If there is already a slot for @wb, keep 4731 * using it. If not replace the oldest one which isn't being 4732 * written out. 4733 */ 4734 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) { 4735 frn = &memcg->cgwb_frn[i]; 4736 if (frn->bdi_id == wb->bdi->id && 4737 frn->memcg_id == wb->memcg_css->id) 4738 break; 4739 if (time_before64(frn->at, oldest_at) && 4740 atomic_read(&frn->done.cnt) == 1) { 4741 oldest = i; 4742 oldest_at = frn->at; 4743 } 4744 } 4745 4746 if (i < MEMCG_CGWB_FRN_CNT) { 4747 /* 4748 * Re-using an existing one. Update timestamp lazily to 4749 * avoid making the cacheline hot. We want them to be 4750 * reasonably up-to-date and significantly shorter than 4751 * dirty_expire_interval as that's what expires the record. 4752 * Use the shorter of 1s and dirty_expire_interval / 8. 4753 */ 4754 unsigned long update_intv = 4755 min_t(unsigned long, HZ, 4756 msecs_to_jiffies(dirty_expire_interval * 10) / 8); 4757 4758 if (time_before64(frn->at, now - update_intv)) 4759 frn->at = now; 4760 } else if (oldest >= 0) { 4761 /* replace the oldest free one */ 4762 frn = &memcg->cgwb_frn[oldest]; 4763 frn->bdi_id = wb->bdi->id; 4764 frn->memcg_id = wb->memcg_css->id; 4765 frn->at = now; 4766 } 4767} 4768 4769/* issue foreign writeback flushes for recorded foreign dirtying events */ 4770void mem_cgroup_flush_foreign(struct bdi_writeback *wb) 4771{ 4772 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); 4773 unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10); 4774 u64 now = jiffies_64; 4775 int i; 4776 4777 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) { 4778 struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i]; 4779 4780 /* 4781 * If the record is older than dirty_expire_interval, 4782 * writeback on it has already started. No need to kick it 4783 * off again. Also, don't start a new one if there's 4784 * already one in flight. 4785 */ 4786 if (time_after64(frn->at, now - intv) && 4787 atomic_read(&frn->done.cnt) == 1) { 4788 frn->at = 0; 4789 trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id); 4790 cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0, 4791 WB_REASON_FOREIGN_FLUSH, 4792 &frn->done); 4793 } 4794 } 4795} 4796 4797#else /* CONFIG_CGROUP_WRITEBACK */ 4798 4799static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) 4800{ 4801 return 0; 4802} 4803 4804static void memcg_wb_domain_exit(struct mem_cgroup *memcg) 4805{ 4806} 4807 4808static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) 4809{ 4810} 4811 4812#endif /* CONFIG_CGROUP_WRITEBACK */ 4813 4814/* 4815 * DO NOT USE IN NEW FILES. 4816 * 4817 * "cgroup.event_control" implementation. 4818 * 4819 * This is way over-engineered. It tries to support fully configurable 4820 * events for each user. Such level of flexibility is completely 4821 * unnecessary especially in the light of the planned unified hierarchy. 4822 * 4823 * Please deprecate this and replace with something simpler if at all 4824 * possible. 4825 */ 4826 4827/* 4828 * Unregister event and free resources. 4829 * 4830 * Gets called from workqueue. 
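 *
 * Runs in process context, so the unregister_event() callback may sleep;
 * this is why memcg_event_wake() defers the teardown to this work item.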
4831 */ 4832static void memcg_event_remove(struct work_struct *work) 4833{ 4834 struct mem_cgroup_event *event = 4835 container_of(work, struct mem_cgroup_event, remove); 4836 struct mem_cgroup *memcg = event->memcg; 4837 4838 remove_wait_queue(event->wqh, &event->wait); 4839 4840 event->unregister_event(memcg, event->eventfd); 4841 4842 /* Notify userspace the event is going away. */ 4843 eventfd_signal(event->eventfd, 1); 4844 4845 eventfd_ctx_put(event->eventfd); 4846 kfree(event); 4847 css_put(&memcg->css); 4848} 4849 4850/* 4851 * Gets called on EPOLLHUP on eventfd when user closes it. 4852 * 4853 * Called with wqh->lock held and interrupts disabled. 4854 */ 4855static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode, 4856 int sync, void *key) 4857{ 4858 struct mem_cgroup_event *event = 4859 container_of(wait, struct mem_cgroup_event, wait); 4860 struct mem_cgroup *memcg = event->memcg; 4861 __poll_t flags = key_to_poll(key); 4862 4863 if (flags & EPOLLHUP) { 4864 /* 4865 * If the event has been detached at cgroup removal, we 4866 * can simply return knowing the other side will cleanup 4867 * for us. 4868 * 4869 * We can't race against event freeing since the other 4870 * side will require wqh->lock via remove_wait_queue(), 4871 * which we hold. 4872 */ 4873 spin_lock(&memcg->event_list_lock); 4874 if (!list_empty(&event->list)) { 4875 list_del_init(&event->list); 4876 /* 4877 * We are in atomic context, but cgroup_event_remove() 4878 * may sleep, so we have to call it in workqueue. 4879 */ 4880 schedule_work(&event->remove); 4881 } 4882 spin_unlock(&memcg->event_list_lock); 4883 } 4884 4885 return 0; 4886} 4887 4888static void memcg_event_ptable_queue_proc(struct file *file, 4889 wait_queue_head_t *wqh, poll_table *pt) 4890{ 4891 struct mem_cgroup_event *event = 4892 container_of(pt, struct mem_cgroup_event, pt); 4893 4894 event->wqh = wqh; 4895 add_wait_queue(wqh, &event->wait); 4896} 4897 4898/* 4899 * DO NOT USE IN NEW FILES. 4900 * 4901 * Parse input and register new cgroup event handler. 4902 * 4903 * Input must be in format '<event_fd> <control_fd> <args>'. 4904 * Interpretation of args is defined by control file implementation. 
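 *
 * For example, registering a 64M usage threshold boils down to writing
 * (descriptor numbers are illustrative):
 *
 *   echo "$event_fd $usage_fd 67108864" > cgroup.event_control
 *
 * where $event_fd refers to an eventfd and $usage_fd to an open
 * memory.usage_in_bytes file of the same cgroup.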
4905 */ 4906static ssize_t memcg_write_event_control(struct kernfs_open_file *of, 4907 char *buf, size_t nbytes, loff_t off) 4908{ 4909 struct cgroup_subsys_state *css = of_css(of); 4910 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4911 struct mem_cgroup_event *event; 4912 struct cgroup_subsys_state *cfile_css; 4913 unsigned int efd, cfd; 4914 struct fd efile; 4915 struct fd cfile; 4916 struct dentry *cdentry; 4917 const char *name; 4918 char *endp; 4919 int ret; 4920 4921 buf = strstrip(buf); 4922 4923 efd = simple_strtoul(buf, &endp, 10); 4924 if (*endp != ' ') 4925 return -EINVAL; 4926 buf = endp + 1; 4927 4928 cfd = simple_strtoul(buf, &endp, 10); 4929 if (*endp == '\0') 4930 buf = endp; 4931 else if (*endp == ' ') 4932 buf = endp + 1; 4933 else 4934 return -EINVAL; 4935 4936 event = kzalloc(sizeof(*event), GFP_KERNEL); 4937 if (!event) 4938 return -ENOMEM; 4939 4940 event->memcg = memcg; 4941 INIT_LIST_HEAD(&event->list); 4942 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); 4943 init_waitqueue_func_entry(&event->wait, memcg_event_wake); 4944 INIT_WORK(&event->remove, memcg_event_remove); 4945 4946 efile = fdget(efd); 4947 if (!efile.file) { 4948 ret = -EBADF; 4949 goto out_kfree; 4950 } 4951 4952 event->eventfd = eventfd_ctx_fileget(efile.file); 4953 if (IS_ERR(event->eventfd)) { 4954 ret = PTR_ERR(event->eventfd); 4955 goto out_put_efile; 4956 } 4957 4958 cfile = fdget(cfd); 4959 if (!cfile.file) { 4960 ret = -EBADF; 4961 goto out_put_eventfd; 4962 } 4963 4964 /* the process need read permission on control file */ 4965 /* AV: shouldn't we check that it's been opened for read instead? */ 4966 ret = inode_permission(file_inode(cfile.file), MAY_READ); 4967 if (ret < 0) 4968 goto out_put_cfile; 4969 4970 /* 4971 * The control file must be a regular cgroup1 file. As a regular cgroup 4972 * file can't be renamed, it's safe to access its name afterwards. 4973 */ 4974 cdentry = cfile.file->f_path.dentry; 4975 if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) { 4976 ret = -EINVAL; 4977 goto out_put_cfile; 4978 } 4979 4980 /* 4981 * Determine the event callbacks and set them in @event. This used 4982 * to be done via struct cftype but cgroup core no longer knows 4983 * about these events. The following is crude but the whole thing 4984 * is for compatibility anyway. 4985 * 4986 * DO NOT ADD NEW FILES. 4987 */ 4988 name = cdentry->d_name.name; 4989 4990 if (!strcmp(name, "memory.usage_in_bytes")) { 4991 event->register_event = mem_cgroup_usage_register_event; 4992 event->unregister_event = mem_cgroup_usage_unregister_event; 4993 } else if (!strcmp(name, "memory.oom_control")) { 4994 event->register_event = mem_cgroup_oom_register_event; 4995 event->unregister_event = mem_cgroup_oom_unregister_event; 4996 } else if (!strcmp(name, "memory.pressure_level")) { 4997 event->register_event = vmpressure_register_event; 4998 event->unregister_event = vmpressure_unregister_event; 4999 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { 5000 event->register_event = memsw_cgroup_usage_register_event; 5001 event->unregister_event = memsw_cgroup_usage_unregister_event; 5002 } else { 5003 ret = -EINVAL; 5004 goto out_put_cfile; 5005 } 5006 5007 /* 5008 * Verify @cfile should belong to @css. Also, remaining events are 5009 * automatically removed on cgroup destruction but the removal is 5010 * asynchronous, so take an extra ref on @css. 
5011 */ 5012 cfile_css = css_tryget_online_from_dir(cdentry->d_parent, 5013 &memory_cgrp_subsys); 5014 ret = -EINVAL; 5015 if (IS_ERR(cfile_css)) 5016 goto out_put_cfile; 5017 if (cfile_css != css) { 5018 css_put(cfile_css); 5019 goto out_put_cfile; 5020 } 5021 5022 ret = event->register_event(memcg, event->eventfd, buf); 5023 if (ret) 5024 goto out_put_css; 5025 5026 vfs_poll(efile.file, &event->pt); 5027 5028 spin_lock(&memcg->event_list_lock); 5029 list_add(&event->list, &memcg->event_list); 5030 spin_unlock(&memcg->event_list_lock); 5031 5032 fdput(cfile); 5033 fdput(efile); 5034 5035 return nbytes; 5036 5037out_put_css: 5038 css_put(css); 5039out_put_cfile: 5040 fdput(cfile); 5041out_put_eventfd: 5042 eventfd_ctx_put(event->eventfd); 5043out_put_efile: 5044 fdput(efile); 5045out_kfree: 5046 kfree(event); 5047 5048 return ret; 5049} 5050 5051static struct cftype mem_cgroup_legacy_files[] = { 5052 { 5053 .name = "usage_in_bytes", 5054 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 5055 .read_u64 = mem_cgroup_read_u64, 5056 }, 5057 { 5058 .name = "max_usage_in_bytes", 5059 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 5060 .write = mem_cgroup_reset, 5061 .read_u64 = mem_cgroup_read_u64, 5062 }, 5063 { 5064 .name = "limit_in_bytes", 5065 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 5066 .write = mem_cgroup_write, 5067 .read_u64 = mem_cgroup_read_u64, 5068 }, 5069 { 5070 .name = "soft_limit_in_bytes", 5071 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 5072 .write = mem_cgroup_write, 5073 .read_u64 = mem_cgroup_read_u64, 5074 }, 5075 { 5076 .name = "failcnt", 5077 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 5078 .write = mem_cgroup_reset, 5079 .read_u64 = mem_cgroup_read_u64, 5080 }, 5081 { 5082 .name = "stat", 5083 .seq_show = memcg_stat_show, 5084 }, 5085 { 5086 .name = "force_empty", 5087 .write = mem_cgroup_force_empty_write, 5088 }, 5089 { 5090 .name = "use_hierarchy", 5091 .write_u64 = mem_cgroup_hierarchy_write, 5092 .read_u64 = mem_cgroup_hierarchy_read, 5093 }, 5094 { 5095 .name = "cgroup.event_control", /* XXX: for compat */ 5096 .write = memcg_write_event_control, 5097 .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE, 5098 }, 5099 { 5100 .name = "swappiness", 5101 .read_u64 = mem_cgroup_swappiness_read, 5102 .write_u64 = mem_cgroup_swappiness_write, 5103 }, 5104 { 5105 .name = "move_charge_at_immigrate", 5106 .read_u64 = mem_cgroup_move_charge_read, 5107 .write_u64 = mem_cgroup_move_charge_write, 5108 }, 5109 { 5110 .name = "oom_control", 5111 .seq_show = mem_cgroup_oom_control_read, 5112 .write_u64 = mem_cgroup_oom_control_write, 5113 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 5114 }, 5115 { 5116 .name = "pressure_level", 5117 }, 5118#ifdef CONFIG_NUMA 5119 { 5120 .name = "numa_stat", 5121 .seq_show = memcg_numa_stat_show, 5122 }, 5123#endif 5124 { 5125 .name = "kmem.limit_in_bytes", 5126 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), 5127 .write = mem_cgroup_write, 5128 .read_u64 = mem_cgroup_read_u64, 5129 }, 5130 { 5131 .name = "kmem.usage_in_bytes", 5132 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), 5133 .read_u64 = mem_cgroup_read_u64, 5134 }, 5135 { 5136 .name = "kmem.failcnt", 5137 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), 5138 .write = mem_cgroup_reset, 5139 .read_u64 = mem_cgroup_read_u64, 5140 }, 5141 { 5142 .name = "kmem.max_usage_in_bytes", 5143 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), 5144 .write = mem_cgroup_reset, 5145 .read_u64 = mem_cgroup_read_u64, 5146 }, 5147#if defined(CONFIG_MEMCG_KMEM) && \ 5148 (defined(CONFIG_SLAB) || 
defined(CONFIG_SLUB_DEBUG)) 5149 { 5150 .name = "kmem.slabinfo", 5151 .seq_show = memcg_slab_show, 5152 }, 5153#endif 5154 { 5155 .name = "kmem.tcp.limit_in_bytes", 5156 .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT), 5157 .write = mem_cgroup_write, 5158 .read_u64 = mem_cgroup_read_u64, 5159 }, 5160 { 5161 .name = "kmem.tcp.usage_in_bytes", 5162 .private = MEMFILE_PRIVATE(_TCP, RES_USAGE), 5163 .read_u64 = mem_cgroup_read_u64, 5164 }, 5165 { 5166 .name = "kmem.tcp.failcnt", 5167 .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT), 5168 .write = mem_cgroup_reset, 5169 .read_u64 = mem_cgroup_read_u64, 5170 }, 5171 { 5172 .name = "kmem.tcp.max_usage_in_bytes", 5173 .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE), 5174 .write = mem_cgroup_reset, 5175 .read_u64 = mem_cgroup_read_u64, 5176 }, 5177 { }, /* terminate */ 5178}; 5179 5180/* 5181 * Private memory cgroup IDR 5182 * 5183 * Swap-out records and page cache shadow entries need to store memcg 5184 * references in constrained space, so we maintain an ID space that is 5185 * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of 5186 * memory-controlled cgroups to 64k. 5187 * 5188 * However, there usually are many references to the offline CSS after 5189 * the cgroup has been destroyed, such as page cache or reclaimable 5190 * slab objects, that don't need to hang on to the ID. We want to keep 5191 * those dead CSS from occupying IDs, or we might quickly exhaust the 5192 * relatively small ID space and prevent the creation of new cgroups 5193 * even when there are much fewer than 64k cgroups - possibly none. 5194 * 5195 * Maintain a private 16-bit ID space for memcg, and allow the ID to 5196 * be freed and recycled when it's no longer needed, which is usually 5197 * when the CSS is offlined. 5198 * 5199 * The only exception to that are records of swapped out tmpfs/shmem 5200 * pages that need to be attributed to live ancestors on swapin. But 5201 * those references are manageable from userspace. 5202 */ 5203 5204static DEFINE_IDR(mem_cgroup_idr); 5205static DEFINE_SPINLOCK(memcg_idr_lock); 5206 5207static int mem_cgroup_alloc_id(void) 5208{ 5209 int ret; 5210 5211 idr_preload(GFP_KERNEL); 5212 spin_lock(&memcg_idr_lock); 5213 ret = idr_alloc(&mem_cgroup_idr, NULL, 1, MEM_CGROUP_ID_MAX, 5214 GFP_NOWAIT); 5215 spin_unlock(&memcg_idr_lock); 5216 idr_preload_end(); 5217 return ret; 5218} 5219 5220static void mem_cgroup_id_remove(struct mem_cgroup *memcg) 5221{ 5222 if (memcg->id.id > 0) { 5223 spin_lock(&memcg_idr_lock); 5224 idr_remove(&mem_cgroup_idr, memcg->id.id); 5225 spin_unlock(&memcg_idr_lock); 5226 5227 memcg->id.id = 0; 5228 } 5229} 5230 5231static void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg, 5232 unsigned int n) 5233{ 5234 refcount_add(n, &memcg->id.ref); 5235} 5236 5237static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) 5238{ 5239 if (refcount_sub_and_test(n, &memcg->id.ref)) { 5240 mem_cgroup_id_remove(memcg); 5241 5242 /* Memcg ID pins CSS */ 5243 css_put(&memcg->css); 5244 } 5245} 5246 5247static inline void mem_cgroup_id_put(struct mem_cgroup *memcg) 5248{ 5249 mem_cgroup_id_put_many(memcg, 1); 5250} 5251 5252/** 5253 * mem_cgroup_from_id - look up a memcg from a memcg id 5254 * @id: the memcg id to look up 5255 * 5256 * Caller must hold rcu_read_lock(). 
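 *
 * A typical lookup is, roughly (sketch only):
 *
 *	rcu_read_lock();
 *	memcg = mem_cgroup_from_id(id);
 *	... use memcg, or pin it with e.g. css_tryget_online() ...
 *	rcu_read_unlock();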
5257 */ 5258struct mem_cgroup *mem_cgroup_from_id(unsigned short id) 5259{ 5260 WARN_ON_ONCE(!rcu_read_lock_held()); 5261#ifdef CONFIG_HYPERHOLD_FILE_LRU 5262 if (id == -1) 5263 return NULL; 5264#endif 5265 return idr_find(&mem_cgroup_idr, id); 5266} 5267 5268static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) 5269{ 5270 struct mem_cgroup_per_node *pn; 5271 int tmp = node; 5272 /* 5273 * This routine is called against possible nodes. 5274 * But it's BUG to call kmalloc() against offline node. 5275 * 5276 * TODO: this routine can waste much memory for nodes which will 5277 * never be onlined. It's better to use memory hotplug callback 5278 * function. 5279 */ 5280 if (!node_state(node, N_NORMAL_MEMORY)) 5281 tmp = -1; 5282 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 5283 if (!pn) 5284 return 1; 5285 5286 pn->lruvec_stat_local = alloc_percpu_gfp(struct lruvec_stat, 5287 GFP_KERNEL_ACCOUNT); 5288 if (!pn->lruvec_stat_local) { 5289 kfree(pn); 5290 return 1; 5291 } 5292 5293 pn->lruvec_stat_cpu = alloc_percpu_gfp(struct lruvec_stat, 5294 GFP_KERNEL_ACCOUNT); 5295 if (!pn->lruvec_stat_cpu) { 5296 free_percpu(pn->lruvec_stat_local); 5297 kfree(pn); 5298 return 1; 5299 } 5300 5301 lruvec_init(&pn->lruvec); 5302 pn->usage_in_excess = 0; 5303 pn->lruvec.pgdat = NODE_DATA(node); 5304 pn->on_tree = false; 5305 pn->memcg = memcg; 5306 5307 memcg->nodeinfo[node] = pn; 5308 return 0; 5309} 5310 5311static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) 5312{ 5313 struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; 5314 5315 if (!pn) 5316 return; 5317 5318 free_percpu(pn->lruvec_stat_cpu); 5319 free_percpu(pn->lruvec_stat_local); 5320 kfree(pn); 5321} 5322 5323static void __mem_cgroup_free(struct mem_cgroup *memcg) 5324{ 5325 int node; 5326 5327 for_each_node(node) 5328 free_mem_cgroup_per_node_info(memcg, node); 5329 free_percpu(memcg->vmstats_percpu); 5330 free_percpu(memcg->vmstats_local); 5331 kfree(memcg); 5332} 5333 5334static void mem_cgroup_free(struct mem_cgroup *memcg) 5335{ 5336 memcg_wb_domain_exit(memcg); 5337 /* 5338 * Flush percpu vmstats and vmevents to guarantee the value correctness 5339 * on parent's and all ancestor levels. 
5340 */ 5341 memcg_flush_percpu_vmstats(memcg); 5342 memcg_flush_percpu_vmevents(memcg); 5343 __mem_cgroup_free(memcg); 5344} 5345 5346static struct mem_cgroup *mem_cgroup_alloc(void) 5347{ 5348 struct mem_cgroup *memcg; 5349 unsigned int size; 5350 int node; 5351 int __maybe_unused i; 5352 long error = -ENOMEM; 5353 5354 size = sizeof(struct mem_cgroup); 5355 size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); 5356 5357 memcg = kzalloc(size, GFP_KERNEL); 5358 if (!memcg) 5359 return ERR_PTR(error); 5360 5361 memcg->id.id = mem_cgroup_alloc_id(); 5362 if (memcg->id.id < 0) { 5363 error = memcg->id.id; 5364 goto fail; 5365 } 5366 5367 memcg->vmstats_local = alloc_percpu_gfp(struct memcg_vmstats_percpu, 5368 GFP_KERNEL_ACCOUNT); 5369 if (!memcg->vmstats_local) 5370 goto fail; 5371 5372 memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu, 5373 GFP_KERNEL_ACCOUNT); 5374 if (!memcg->vmstats_percpu) 5375 goto fail; 5376 5377 for_each_node(node) 5378 if (alloc_mem_cgroup_per_node_info(memcg, node)) 5379 goto fail; 5380 5381 if (memcg_wb_domain_init(memcg, GFP_KERNEL)) 5382 goto fail; 5383 5384 INIT_WORK(&memcg->high_work, high_work_func); 5385 INIT_LIST_HEAD(&memcg->oom_notify); 5386 mutex_init(&memcg->thresholds_lock); 5387 spin_lock_init(&memcg->move_lock); 5388 vmpressure_init(&memcg->vmpressure); 5389 INIT_LIST_HEAD(&memcg->event_list); 5390 spin_lock_init(&memcg->event_list_lock); 5391 memcg->socket_pressure = jiffies; 5392#ifdef CONFIG_MEMCG_KMEM 5393 memcg->kmemcg_id = -1; 5394 INIT_LIST_HEAD(&memcg->objcg_list); 5395#endif 5396#ifdef CONFIG_CGROUP_WRITEBACK 5397 INIT_LIST_HEAD(&memcg->cgwb_list); 5398 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) 5399 memcg->cgwb_frn[i].done = 5400 __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq); 5401#endif 5402#ifdef CONFIG_TRANSPARENT_HUGEPAGE 5403 spin_lock_init(&memcg->deferred_split_queue.split_queue_lock); 5404 INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue); 5405 memcg->deferred_split_queue.split_queue_len = 0; 5406#endif 5407 5408#ifdef CONFIG_HYPERHOLD_MEMCG 5409 if (unlikely(!score_head_inited)) { 5410 INIT_LIST_HEAD(&score_head); 5411 score_head_inited = true; 5412 } 5413#endif 5414 5415#ifdef CONFIG_HYPERHOLD_MEMCG 5416 INIT_LIST_HEAD(&memcg->score_node); 5417#endif 5418 idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); 5419 return memcg; 5420fail: 5421 mem_cgroup_id_remove(memcg); 5422 __mem_cgroup_free(memcg); 5423 return ERR_PTR(error); 5424} 5425 5426static struct cgroup_subsys_state * __ref 5427mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 5428{ 5429 struct mem_cgroup *parent = mem_cgroup_from_css(parent_css); 5430 struct mem_cgroup *memcg, *old_memcg; 5431 long error = -ENOMEM; 5432 5433 old_memcg = set_active_memcg(parent); 5434 memcg = mem_cgroup_alloc(); 5435 set_active_memcg(old_memcg); 5436 if (IS_ERR(memcg)) 5437 return ERR_CAST(memcg); 5438 5439#ifdef CONFIG_HYPERHOLD_MEMCG 5440 atomic64_set(&memcg->memcg_reclaimed.app_score, 300); 5441#endif 5442#ifdef CONFIG_HYPERHOLD_ZSWAPD 5443 atomic_set(&memcg->memcg_reclaimed.ub_zram2ufs_ratio, 10); 5444 atomic_set(&memcg->memcg_reclaimed.ub_mem2zram_ratio, 60); 5445 atomic_set(&memcg->memcg_reclaimed.refault_threshold, 50); 5446#endif 5447 page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); 5448 memcg->soft_limit = PAGE_COUNTER_MAX; 5449 page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); 5450 if (parent) { 5451 memcg->swappiness = mem_cgroup_swappiness(parent); 5452 memcg->oom_kill_disable = parent->oom_kill_disable; 5453 } 5454 if 
(!parent) { 5455 page_counter_init(&memcg->memory, NULL); 5456 page_counter_init(&memcg->swap, NULL); 5457 page_counter_init(&memcg->kmem, NULL); 5458 page_counter_init(&memcg->tcpmem, NULL); 5459 } else if (parent->use_hierarchy) { 5460 memcg->use_hierarchy = true; 5461 page_counter_init(&memcg->memory, &parent->memory); 5462 page_counter_init(&memcg->swap, &parent->swap); 5463 page_counter_init(&memcg->kmem, &parent->kmem); 5464 page_counter_init(&memcg->tcpmem, &parent->tcpmem); 5465 } else { 5466 page_counter_init(&memcg->memory, &root_mem_cgroup->memory); 5467 page_counter_init(&memcg->swap, &root_mem_cgroup->swap); 5468 page_counter_init(&memcg->kmem, &root_mem_cgroup->kmem); 5469 page_counter_init(&memcg->tcpmem, &root_mem_cgroup->tcpmem); 5470 /* 5471 * Deeper hierachy with use_hierarchy == false doesn't make 5472 * much sense so let cgroup subsystem know about this 5473 * unfortunate state in our controller. 5474 */ 5475 if (parent != root_mem_cgroup) 5476 memory_cgrp_subsys.broken_hierarchy = true; 5477 } 5478 5479 /* The following stuff does not apply to the root */ 5480 if (!parent) { 5481 root_mem_cgroup = memcg; 5482 return &memcg->css; 5483 } 5484 5485 error = memcg_online_kmem(memcg); 5486 if (error) 5487 goto fail; 5488 5489 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) 5490 static_branch_inc(&memcg_sockets_enabled_key); 5491 5492 return &memcg->css; 5493fail: 5494 mem_cgroup_id_remove(memcg); 5495 mem_cgroup_free(memcg); 5496 return ERR_PTR(error); 5497} 5498 5499static int mem_cgroup_css_online(struct cgroup_subsys_state *css) 5500{ 5501 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5502 5503 /* 5504 * A memcg must be visible for memcg_expand_shrinker_maps() 5505 * by the time the maps are allocated. So, we allocate maps 5506 * here, when for_each_mem_cgroup() can't skip it. 5507 */ 5508 if (memcg_alloc_shrinker_maps(memcg)) { 5509 mem_cgroup_id_remove(memcg); 5510 return -ENOMEM; 5511 } 5512 5513#ifdef CONFIG_HYPERHOLD_MEMCG 5514 memcg_app_score_update(memcg); 5515 css_get(css); 5516#endif 5517 5518 /* Online state pins memcg ID, memcg ID pins CSS */ 5519 refcount_set(&memcg->id.ref, 1); 5520 css_get(css); 5521 return 0; 5522} 5523 5524static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) 5525{ 5526 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5527 struct mem_cgroup_event *event, *tmp; 5528 5529#ifdef CONFIG_HYPERHOLD_MEMCG 5530 unsigned long flags; 5531 5532 write_lock_irqsave(&score_list_lock, flags); 5533 list_del_init(&memcg->score_node); 5534 write_unlock_irqrestore(&score_list_lock, flags); 5535 css_put(css); 5536#endif 5537 5538 /* 5539 * Unregister events and notify userspace. 5540 * Notify userspace about cgroup removing only after rmdir of cgroup 5541 * directory to avoid race between userspace and kernelspace. 
5542 */ 5543 spin_lock(&memcg->event_list_lock); 5544 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { 5545 list_del_init(&event->list); 5546 schedule_work(&event->remove); 5547 } 5548 spin_unlock(&memcg->event_list_lock); 5549 5550 page_counter_set_min(&memcg->memory, 0); 5551 page_counter_set_low(&memcg->memory, 0); 5552 5553 memcg_offline_kmem(memcg); 5554 wb_memcg_offline(memcg); 5555 5556 drain_all_stock(memcg); 5557 5558 mem_cgroup_id_put(memcg); 5559} 5560 5561static void mem_cgroup_css_released(struct cgroup_subsys_state *css) 5562{ 5563 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5564 5565 invalidate_reclaim_iterators(memcg); 5566} 5567 5568static void mem_cgroup_css_free(struct cgroup_subsys_state *css) 5569{ 5570 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5571 int __maybe_unused i; 5572 5573#ifdef CONFIG_CGROUP_WRITEBACK 5574 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) 5575 wb_wait_for_completion(&memcg->cgwb_frn[i].done); 5576#endif 5577 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) 5578 static_branch_dec(&memcg_sockets_enabled_key); 5579 5580 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active) 5581 static_branch_dec(&memcg_sockets_enabled_key); 5582 5583 vmpressure_cleanup(&memcg->vmpressure); 5584 cancel_work_sync(&memcg->high_work); 5585 mem_cgroup_remove_from_trees(memcg); 5586 memcg_free_shrinker_maps(memcg); 5587 memcg_free_kmem(memcg); 5588 mem_cgroup_free(memcg); 5589} 5590 5591/** 5592 * mem_cgroup_css_reset - reset the states of a mem_cgroup 5593 * @css: the target css 5594 * 5595 * Reset the states of the mem_cgroup associated with @css. This is 5596 * invoked when the userland requests disabling on the default hierarchy 5597 * but the memcg is pinned through dependency. The memcg should stop 5598 * applying policies and should revert to the vanilla state as it may be 5599 * made visible again. 5600 * 5601 * The current implementation only resets the essential configurations. 5602 * This needs to be expanded to cover all the visible parts. 5603 */ 5604static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) 5605{ 5606 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5607 5608 page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX); 5609 page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX); 5610 page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX); 5611 page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX); 5612 page_counter_set_min(&memcg->memory, 0); 5613 page_counter_set_low(&memcg->memory, 0); 5614 page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); 5615 memcg->soft_limit = PAGE_COUNTER_MAX; 5616 page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); 5617 memcg_wb_domain_size_changed(memcg); 5618} 5619 5620#ifdef CONFIG_MMU 5621/* Handlers for move charge at task migration. 
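Precharging first attempts a single bulk try_charge() without direct reclaim; if that fails, mem_cgroup_do_precharge() falls back to charging one page at a time with reclaim allowed but no retries (__GFP_NORETRY).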
*/ 5622static int mem_cgroup_do_precharge(unsigned long count) 5623{ 5624 int ret; 5625 5626 /* Try a single bulk charge without reclaim first, kswapd may wake */ 5627 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count); 5628 if (!ret) { 5629 mc.precharge += count; 5630 return ret; 5631 } 5632 5633 /* Try charges one by one with reclaim, but do not retry */ 5634 while (count--) { 5635 ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1); 5636 if (ret) 5637 return ret; 5638 mc.precharge++; 5639 cond_resched(); 5640 } 5641 return 0; 5642} 5643 5644union mc_target { 5645 struct page *page; 5646 swp_entry_t ent; 5647}; 5648 5649enum mc_target_type { 5650 MC_TARGET_NONE = 0, 5651 MC_TARGET_PAGE, 5652 MC_TARGET_SWAP, 5653 MC_TARGET_DEVICE, 5654}; 5655 5656static struct page *mc_handle_present_pte(struct vm_area_struct *vma, 5657 unsigned long addr, pte_t ptent) 5658{ 5659 struct page *page = vm_normal_page(vma, addr, ptent); 5660 5661 if (!page || !page_mapped(page)) 5662 return NULL; 5663 if (PageAnon(page)) { 5664 if (!(mc.flags & MOVE_ANON)) 5665 return NULL; 5666 } else { 5667 if (!(mc.flags & MOVE_FILE)) 5668 return NULL; 5669 } 5670 if (!get_page_unless_zero(page)) 5671 return NULL; 5672 5673 return page; 5674} 5675 5676#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE) 5677static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 5678 pte_t ptent, swp_entry_t *entry) 5679{ 5680 struct page *page = NULL; 5681 swp_entry_t ent = pte_to_swp_entry(ptent); 5682 5683 if (!(mc.flags & MOVE_ANON)) 5684 return NULL; 5685 5686 /* 5687 * Handle MEMORY_DEVICE_PRIVATE which are ZONE_DEVICE page belonging to 5688 * a device and because they are not accessible by CPU they are store 5689 * as special swap entry in the CPU page table. 5690 */ 5691 if (is_device_private_entry(ent)) { 5692 page = device_private_entry_to_page(ent); 5693 /* 5694 * MEMORY_DEVICE_PRIVATE means ZONE_DEVICE page and which have 5695 * a refcount of 1 when free (unlike normal page) 5696 */ 5697 if (!page_ref_add_unless(page, 1, 1)) 5698 return NULL; 5699 return page; 5700 } 5701 5702 if (non_swap_entry(ent)) 5703 return NULL; 5704 5705 /* 5706 * Because lookup_swap_cache() updates some statistics counter, 5707 * we call find_get_page() with swapper_space directly. 5708 */ 5709 page = find_get_page(swap_address_space(ent), swp_offset(ent)); 5710 entry->val = ent.val; 5711 5712 return page; 5713} 5714#else 5715static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 5716 pte_t ptent, swp_entry_t *entry) 5717{ 5718 return NULL; 5719} 5720#endif 5721 5722static struct page *mc_handle_file_pte(struct vm_area_struct *vma, 5723 unsigned long addr, pte_t ptent, swp_entry_t *entry) 5724{ 5725 if (!vma->vm_file) /* anonymous vma */ 5726 return NULL; 5727 if (!(mc.flags & MOVE_FILE)) 5728 return NULL; 5729 5730 /* page is moved even if it's not RSS of this task(page-faulted). */ 5731 /* shmem/tmpfs may report page out on swap: account for that too. */ 5732 return find_get_incore_page(vma->vm_file->f_mapping, 5733 linear_page_index(vma, addr)); 5734} 5735 5736/** 5737 * mem_cgroup_move_account - move account of the page 5738 * @page: the page 5739 * @compound: charge the page as compound or small page 5740 * @from: mem_cgroup which the page is moved from. 5741 * @to: mem_cgroup which the page is moved to. @from != @to. 5742 * 5743 * The caller must make sure the page is not on LRU (isolate_page() is useful.) 
5744 * 5745 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" 5746 * from old cgroup. 5747 */ 5748static int mem_cgroup_move_account(struct page *page, 5749 bool compound, 5750 struct mem_cgroup *from, 5751 struct mem_cgroup *to) 5752{ 5753 struct lruvec *from_vec, *to_vec; 5754 struct pglist_data *pgdat; 5755 unsigned int nr_pages = compound ? thp_nr_pages(page) : 1; 5756 int ret; 5757 5758 VM_BUG_ON(from == to); 5759 VM_BUG_ON_PAGE(PageLRU(page), page); 5760 VM_BUG_ON(compound && !PageTransHuge(page)); 5761 5762 /* 5763 * Prevent mem_cgroup_migrate() from looking at 5764 * page->mem_cgroup of its source page while we change it. 5765 */ 5766 ret = -EBUSY; 5767 if (!trylock_page(page)) 5768 goto out; 5769 5770 ret = -EINVAL; 5771 if (page->mem_cgroup != from) 5772 goto out_unlock; 5773 5774 pgdat = page_pgdat(page); 5775 from_vec = mem_cgroup_lruvec(from, pgdat); 5776 to_vec = mem_cgroup_lruvec(to, pgdat); 5777 5778 lock_page_memcg(page); 5779 5780 if (PageAnon(page)) { 5781 if (page_mapped(page)) { 5782 __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages); 5783 __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages); 5784 if (PageTransHuge(page)) { 5785 __dec_lruvec_state(from_vec, NR_ANON_THPS); 5786 __inc_lruvec_state(to_vec, NR_ANON_THPS); 5787 } 5788 5789 } 5790 } else { 5791 __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages); 5792 __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages); 5793 5794 if (PageSwapBacked(page)) { 5795 __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages); 5796 __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages); 5797 } 5798 5799 if (page_mapped(page)) { 5800 __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages); 5801 __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages); 5802 } 5803 5804 if (PageDirty(page)) { 5805 struct address_space *mapping = page_mapping(page); 5806 5807 if (mapping_can_writeback(mapping)) { 5808 __mod_lruvec_state(from_vec, NR_FILE_DIRTY, 5809 -nr_pages); 5810 __mod_lruvec_state(to_vec, NR_FILE_DIRTY, 5811 nr_pages); 5812 } 5813 } 5814 } 5815 5816 if (PageWriteback(page)) { 5817 __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages); 5818 __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages); 5819 } 5820 5821 /* 5822 * All state has been migrated, let's switch to the new memcg. 5823 * 5824 * It is safe to change page->mem_cgroup here because the page 5825 * is referenced, charged, isolated, and locked: we can't race 5826 * with (un)charging, migration, LRU putback, or anything else 5827 * that would rely on a stable page->mem_cgroup. 5828 * 5829 * Note that lock_page_memcg is a memcg lock, not a page lock, 5830 * to save space. As soon as we switch page->mem_cgroup to a 5831 * new memcg that isn't locked, the above state can change 5832 * concurrently again. Make sure we're truly done with it. 
5833 */ 5834 smp_mb(); 5835 5836 css_get(&to->css); 5837 css_put(&from->css); 5838 5839 page->mem_cgroup = to; 5840 5841 __unlock_page_memcg(from); 5842 5843 ret = 0; 5844 5845 local_irq_disable(); 5846 mem_cgroup_charge_statistics(to, page, nr_pages); 5847 memcg_check_events(to, page); 5848 mem_cgroup_charge_statistics(from, page, -nr_pages); 5849 memcg_check_events(from, page); 5850 local_irq_enable(); 5851out_unlock: 5852 unlock_page(page); 5853out: 5854 return ret; 5855} 5856 5857/** 5858 * get_mctgt_type - get target type of moving charge 5859 * @vma: the vma the pte to be checked belongs 5860 * @addr: the address corresponding to the pte to be checked 5861 * @ptent: the pte to be checked 5862 * @target: the pointer the target page or swap ent will be stored(can be NULL) 5863 * 5864 * Returns 5865 * 0(MC_TARGET_NONE): if the pte is not a target for move charge. 5866 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for 5867 * move charge. if @target is not NULL, the page is stored in target->page 5868 * with extra refcnt got(Callers should handle it). 5869 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a 5870 * target for charge migration. if @target is not NULL, the entry is stored 5871 * in target->ent. 5872 * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE 5873 * (so ZONE_DEVICE page and thus not on the lru). 5874 * For now we such page is charge like a regular page would be as for all 5875 * intent and purposes it is just special memory taking the place of a 5876 * regular page. 5877 * 5878 * See Documentations/vm/hmm.txt and include/linux/hmm.h 5879 * 5880 * Called with pte lock held. 5881 */ 5882 5883static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, 5884 unsigned long addr, pte_t ptent, union mc_target *target) 5885{ 5886 struct page *page = NULL; 5887 enum mc_target_type ret = MC_TARGET_NONE; 5888 swp_entry_t ent = { .val = 0 }; 5889 5890 if (pte_present(ptent)) 5891 page = mc_handle_present_pte(vma, addr, ptent); 5892 else if (is_swap_pte(ptent)) 5893 page = mc_handle_swap_pte(vma, ptent, &ent); 5894 else if (pte_none(ptent)) 5895 page = mc_handle_file_pte(vma, addr, ptent, &ent); 5896 5897 if (!page && !ent.val) 5898 return ret; 5899 if (page) { 5900 /* 5901 * Do only loose check w/o serialization. 5902 * mem_cgroup_move_account() checks the page is valid or 5903 * not under LRU exclusion. 5904 */ 5905 if (page->mem_cgroup == mc.from) { 5906 ret = MC_TARGET_PAGE; 5907 if (is_device_private_page(page)) 5908 ret = MC_TARGET_DEVICE; 5909 if (target) 5910 target->page = page; 5911 } 5912 if (!ret || !target) 5913 put_page(page); 5914 } 5915 /* 5916 * There is a swap entry and a page doesn't exist or isn't charged. 5917 * But we cannot move a tail-page in a THP. 5918 */ 5919 if (ent.val && !ret && (!page || !PageTransCompound(page)) && 5920 mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) { 5921 ret = MC_TARGET_SWAP; 5922 if (target) 5923 target->ent = ent; 5924 } 5925 return ret; 5926} 5927 5928#ifdef CONFIG_TRANSPARENT_HUGEPAGE 5929/* 5930 * We don't consider PMD mapped swapping or file mapped pages because THP does 5931 * not support them for now. 5932 * Caller should make sure that pmd_trans_huge(pmd) is true. 
5933 */ 5934static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 5935 unsigned long addr, pmd_t pmd, union mc_target *target) 5936{ 5937 struct page *page = NULL; 5938 enum mc_target_type ret = MC_TARGET_NONE; 5939 5940 if (unlikely(is_swap_pmd(pmd))) { 5941 VM_BUG_ON(thp_migration_supported() && 5942 !is_pmd_migration_entry(pmd)); 5943 return ret; 5944 } 5945 page = pmd_page(pmd); 5946 VM_BUG_ON_PAGE(!page || !PageHead(page), page); 5947 if (!(mc.flags & MOVE_ANON)) 5948 return ret; 5949 if (page->mem_cgroup == mc.from) { 5950 ret = MC_TARGET_PAGE; 5951 if (target) { 5952 get_page(page); 5953 target->page = page; 5954 } 5955 } 5956 return ret; 5957} 5958#else 5959static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 5960 unsigned long addr, pmd_t pmd, union mc_target *target) 5961{ 5962 return MC_TARGET_NONE; 5963} 5964#endif 5965 5966static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, 5967 unsigned long addr, unsigned long end, 5968 struct mm_walk *walk) 5969{ 5970 struct vm_area_struct *vma = walk->vma; 5971 pte_t *pte; 5972 spinlock_t *ptl; 5973 5974 ptl = pmd_trans_huge_lock(pmd, vma); 5975 if (ptl) { 5976 /* 5977 * Note there cannot be MC_TARGET_DEVICE for now as we do not 5978 * support transparent huge pages with MEMORY_DEVICE_PRIVATE, but 5979 * this might change. 5980 */ 5981 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) 5982 mc.precharge += HPAGE_PMD_NR; 5983 spin_unlock(ptl); 5984 return 0; 5985 } 5986 5987 if (pmd_trans_unstable(pmd)) 5988 return 0; 5989 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5990 for (; addr != end; pte++, addr += PAGE_SIZE) 5991 if (get_mctgt_type(vma, addr, *pte, NULL)) 5992 mc.precharge++; /* increment precharge temporarily */ 5993 pte_unmap_unlock(pte - 1, ptl); 5994 cond_resched(); 5995 5996 return 0; 5997} 5998 5999static const struct mm_walk_ops precharge_walk_ops = { 6000 .pmd_entry = mem_cgroup_count_precharge_pte_range, 6001}; 6002 6003static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) 6004{ 6005 unsigned long precharge; 6006 6007 mmap_read_lock(mm); 6008 walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL); 6009 mmap_read_unlock(mm); 6010 6011 precharge = mc.precharge; 6012 mc.precharge = 0; 6013 6014 return precharge; 6015} 6016 6017static int mem_cgroup_precharge_mc(struct mm_struct *mm) 6018{ 6019 unsigned long precharge = mem_cgroup_count_precharge(mm); 6020 6021 VM_BUG_ON(mc.moving_task); 6022 mc.moving_task = current; 6023 return mem_cgroup_do_precharge(precharge); 6024} 6025 6026/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ 6027static void __mem_cgroup_clear_mc(void) 6028{ 6029 struct mem_cgroup *from = mc.from; 6030 struct mem_cgroup *to = mc.to; 6031 6032 /* we must uncharge all the leftover precharges from mc.to */ 6033 if (mc.precharge) { 6034 cancel_charge(mc.to, mc.precharge); 6035 mc.precharge = 0; 6036 } 6037 /* 6038 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so 6039 * we must uncharge here.
6040 */ 6041 if (mc.moved_charge) { 6042 cancel_charge(mc.from, mc.moved_charge); 6043 mc.moved_charge = 0; 6044 } 6045 /* we must fixup refcnts and charges */ 6046 if (mc.moved_swap) { 6047 /* uncharge swap account from the old cgroup */ 6048 if (!mem_cgroup_is_root(mc.from)) 6049 page_counter_uncharge(&mc.from->memsw, mc.moved_swap); 6050 6051 mem_cgroup_id_put_many(mc.from, mc.moved_swap); 6052 6053 /* 6054 * we charged both to->memory and to->memsw, so we 6055 * should uncharge to->memory. 6056 */ 6057 if (!mem_cgroup_is_root(mc.to)) 6058 page_counter_uncharge(&mc.to->memory, mc.moved_swap); 6059 6060 mc.moved_swap = 0; 6061 } 6062 memcg_oom_recover(from); 6063 memcg_oom_recover(to); 6064 wake_up_all(&mc.waitq); 6065} 6066 6067static void mem_cgroup_clear_mc(void) 6068{ 6069 struct mm_struct *mm = mc.mm; 6070 6071 /* 6072 * we must clear moving_task before waking up waiters at the end of 6073 * task migration. 6074 */ 6075 mc.moving_task = NULL; 6076 __mem_cgroup_clear_mc(); 6077 spin_lock(&mc.lock); 6078 mc.from = NULL; 6079 mc.to = NULL; 6080 mc.mm = NULL; 6081 spin_unlock(&mc.lock); 6082 6083 mmput(mm); 6084} 6085 6086static int mem_cgroup_can_attach(struct cgroup_taskset *tset) 6087{ 6088 struct cgroup_subsys_state *css; 6089 struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */ 6090 struct mem_cgroup *from; 6091 struct task_struct *leader, *p; 6092 struct mm_struct *mm; 6093 unsigned long move_flags; 6094 int ret = 0; 6095 6096 /* charge immigration isn't supported on the default hierarchy */ 6097 if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 6098 return 0; 6099 6100 /* 6101 * Multi-process migrations only happen on the default hierarchy 6102 * where charge immigration is not used. Perform charge 6103 * immigration if @tset contains a leader and whine if there are 6104 * multiple. 6105 */ 6106 p = NULL; 6107 cgroup_taskset_for_each_leader(leader, css, tset) { 6108 WARN_ON_ONCE(p); 6109 p = leader; 6110 memcg = mem_cgroup_from_css(css); 6111 } 6112 if (!p) 6113 return 0; 6114 6115 /* 6116 * We are now committed to this value whatever it is. Changes in this 6117 * tunable will only affect upcoming migrations, not the current one. 6118 * So we need to save it, and keep it going.
6119 */ 6120 move_flags = READ_ONCE(memcg->move_charge_at_immigrate); 6121 if (!move_flags) 6122 return 0; 6123 6124 from = mem_cgroup_from_task(p); 6125 6126 VM_BUG_ON(from == memcg); 6127 6128 mm = get_task_mm(p); 6129 if (!mm) 6130 return 0; 6131 /* We move charges only when we move a owner of the mm */ 6132 if (mm->owner == p) { 6133 VM_BUG_ON(mc.from); 6134 VM_BUG_ON(mc.to); 6135 VM_BUG_ON(mc.precharge); 6136 VM_BUG_ON(mc.moved_charge); 6137 VM_BUG_ON(mc.moved_swap); 6138 6139 spin_lock(&mc.lock); 6140 mc.mm = mm; 6141 mc.from = from; 6142 mc.to = memcg; 6143 mc.flags = move_flags; 6144 spin_unlock(&mc.lock); 6145 /* We set mc.moving_task later */ 6146 6147 ret = mem_cgroup_precharge_mc(mm); 6148 if (ret) 6149 mem_cgroup_clear_mc(); 6150 } else { 6151 mmput(mm); 6152 } 6153 return ret; 6154} 6155 6156static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) 6157{ 6158 if (mc.to) 6159 mem_cgroup_clear_mc(); 6160} 6161 6162static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, 6163 unsigned long addr, unsigned long end, 6164 struct mm_walk *walk) 6165{ 6166 int ret = 0; 6167 struct vm_area_struct *vma = walk->vma; 6168 pte_t *pte; 6169 spinlock_t *ptl; 6170 enum mc_target_type target_type; 6171 union mc_target target; 6172 struct page *page; 6173 6174 ptl = pmd_trans_huge_lock(pmd, vma); 6175 if (ptl) { 6176 if (mc.precharge < HPAGE_PMD_NR) { 6177 spin_unlock(ptl); 6178 return 0; 6179 } 6180 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); 6181 if (target_type == MC_TARGET_PAGE) { 6182 page = target.page; 6183 if (!isolate_lru_page(page)) { 6184 if (!mem_cgroup_move_account(page, true, 6185 mc.from, mc.to)) { 6186 mc.precharge -= HPAGE_PMD_NR; 6187 mc.moved_charge += HPAGE_PMD_NR; 6188 } 6189 putback_lru_page(page); 6190 } 6191 put_page(page); 6192 } else if (target_type == MC_TARGET_DEVICE) { 6193 page = target.page; 6194 if (!mem_cgroup_move_account(page, true, 6195 mc.from, mc.to)) { 6196 mc.precharge -= HPAGE_PMD_NR; 6197 mc.moved_charge += HPAGE_PMD_NR; 6198 } 6199 put_page(page); 6200 } 6201 spin_unlock(ptl); 6202 return 0; 6203 } 6204 6205 if (pmd_trans_unstable(pmd)) 6206 return 0; 6207retry: 6208 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 6209 for (; addr != end; addr += PAGE_SIZE) { 6210 pte_t ptent = *(pte++); 6211 bool device = false; 6212 swp_entry_t ent; 6213 6214 if (!mc.precharge) 6215 break; 6216 6217 switch (get_mctgt_type(vma, addr, ptent, &target)) { 6218 case MC_TARGET_DEVICE: 6219 device = true; 6220 fallthrough; 6221 case MC_TARGET_PAGE: 6222 page = target.page; 6223 /* 6224 * We can have a part of the split pmd here. Moving it 6225 * can be done but it would be too convoluted so simply 6226 * ignore such a partial THP and keep it in original 6227 * memcg. There should be somebody mapping the head. 6228 */ 6229 if (PageTransCompound(page)) 6230 goto put; 6231 if (!device && isolate_lru_page(page)) 6232 goto put; 6233 if (!mem_cgroup_move_account(page, false, 6234 mc.from, mc.to)) { 6235 mc.precharge--; 6236 /* we uncharge from mc.from later. */ 6237 mc.moved_charge++; 6238 } 6239 if (!device) 6240 putback_lru_page(page); 6241put: /* get_mctgt_type() gets the page */ 6242 put_page(page); 6243 break; 6244 case MC_TARGET_SWAP: 6245 ent = target.ent; 6246 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) { 6247 mc.precharge--; 6248 mem_cgroup_id_get_many(mc.to, 1); 6249 /* we fixup other refcnts and charges later. 
*/ 6250 mc.moved_swap++; 6251 } 6252 break; 6253 default: 6254 break; 6255 } 6256 } 6257 pte_unmap_unlock(pte - 1, ptl); 6258 cond_resched(); 6259 6260 if (addr != end) { 6261 /* 6262 * We have consumed all precharges we got in can_attach(). 6263 * We try to charge one by one, but don't do any additional 6264 * charges to mc.to if we have already failed to charge once 6265 * in the attach() phase. 6266 */ 6267 ret = mem_cgroup_do_precharge(1); 6268 if (!ret) 6269 goto retry; 6270 } 6271 6272 return ret; 6273} 6274 6275static const struct mm_walk_ops charge_walk_ops = { 6276 .pmd_entry = mem_cgroup_move_charge_pte_range, 6277}; 6278 6279static void mem_cgroup_move_charge(void) 6280{ 6281 lru_add_drain_all(); 6282 /* 6283 * Signal lock_page_memcg() to take the memcg's move_lock 6284 * while we're moving its pages to another memcg. Then wait 6285 * for already started RCU-only updates to finish. 6286 */ 6287 atomic_inc(&mc.from->moving_account); 6288 synchronize_rcu(); 6289retry: 6290 if (unlikely(!mmap_read_trylock(mc.mm))) { 6291 /* 6292 * Someone who is holding the mmap_lock might be waiting on 6293 * the waitq. So we cancel all extra charges, wake up all 6294 * waiters, and retry. Because we cancel precharges, we might 6295 * not be able to move enough charges, but moving charge is a 6296 * best-effort feature anyway, so it wouldn't be a big problem. 6297 */ 6298 __mem_cgroup_clear_mc(); 6299 cond_resched(); 6300 goto retry; 6301 } 6302 /* 6303 * When we have consumed all precharges and failed to do an 6304 * additional charge, the page walk just aborts. 6305 */ 6306 walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops, 6307 NULL); 6308 6309 mmap_read_unlock(mc.mm); 6310 atomic_dec(&mc.from->moving_account); 6311} 6312 6313static void mem_cgroup_move_task(void) 6314{ 6315 if (mc.to) { 6316 mem_cgroup_move_charge(); 6317 mem_cgroup_clear_mc(); 6318 } 6319} 6320#else /* !CONFIG_MMU */ 6321static int mem_cgroup_can_attach(struct cgroup_taskset *tset) 6322{ 6323 return 0; 6324} 6325static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) 6326{ 6327} 6328static void mem_cgroup_move_task(void) 6329{ 6330} 6331#endif 6332 6333/* 6334 * Cgroup retains root cgroups across [un]mount cycles making it necessary 6335 * to verify whether we're attached to the default hierarchy on each mount 6336 * attempt. 6337 */ 6338static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) 6339{ 6340 /* 6341 * use_hierarchy is forced on the default hierarchy. cgroup core 6342 * guarantees that @root doesn't have any children, so turning it 6343 * on for the root memcg is enough.
6344 */ 6345 if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 6346 root_mem_cgroup->use_hierarchy = true; 6347 else 6348 root_mem_cgroup->use_hierarchy = false; 6349} 6350 6351static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) 6352{ 6353 if (value == PAGE_COUNTER_MAX) 6354 seq_puts(m, "max\n"); 6355 else 6356 seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE); 6357 6358 return 0; 6359} 6360 6361static u64 memory_current_read(struct cgroup_subsys_state *css, 6362 struct cftype *cft) 6363{ 6364 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 6365 6366 return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE; 6367} 6368 6369static int memory_min_show(struct seq_file *m, void *v) 6370{ 6371 return seq_puts_memcg_tunable(m, 6372 READ_ONCE(mem_cgroup_from_seq(m)->memory.min)); 6373} 6374 6375static ssize_t memory_min_write(struct kernfs_open_file *of, 6376 char *buf, size_t nbytes, loff_t off) 6377{ 6378 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 6379 unsigned long min; 6380 int err; 6381 6382 buf = strstrip(buf); 6383 err = page_counter_memparse(buf, "max", &min); 6384 if (err) 6385 return err; 6386 6387 page_counter_set_min(&memcg->memory, min); 6388 6389 return nbytes; 6390} 6391 6392static int memory_low_show(struct seq_file *m, void *v) 6393{ 6394 return seq_puts_memcg_tunable(m, 6395 READ_ONCE(mem_cgroup_from_seq(m)->memory.low)); 6396} 6397 6398static ssize_t memory_low_write(struct kernfs_open_file *of, 6399 char *buf, size_t nbytes, loff_t off) 6400{ 6401 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 6402 unsigned long low; 6403 int err; 6404 6405 buf = strstrip(buf); 6406 err = page_counter_memparse(buf, "max", &low); 6407 if (err) 6408 return err; 6409 6410 page_counter_set_low(&memcg->memory, low); 6411 6412 return nbytes; 6413} 6414 6415static int memory_high_show(struct seq_file *m, void *v) 6416{ 6417 return seq_puts_memcg_tunable(m, 6418 READ_ONCE(mem_cgroup_from_seq(m)->memory.high)); 6419} 6420 6421static ssize_t memory_high_write(struct kernfs_open_file *of, 6422 char *buf, size_t nbytes, loff_t off) 6423{ 6424 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 6425 unsigned int nr_retries = MAX_RECLAIM_RETRIES; 6426 bool drained = false; 6427 unsigned long high; 6428 int err; 6429 6430 buf = strstrip(buf); 6431 err = page_counter_memparse(buf, "max", &high); 6432 if (err) 6433 return err; 6434 6435 page_counter_set_high(&memcg->memory, high); 6436 6437 for (;;) { 6438 unsigned long nr_pages = page_counter_read(&memcg->memory); 6439 unsigned long reclaimed; 6440 6441 if (nr_pages <= high) 6442 break; 6443 6444 if (signal_pending(current)) 6445 break; 6446 6447 if (!drained) { 6448 drain_all_stock(memcg); 6449 drained = true; 6450 continue; 6451 } 6452 6453 reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high, 6454 GFP_KERNEL, true); 6455 6456 if (!reclaimed && !nr_retries--) 6457 break; 6458 } 6459 6460 memcg_wb_domain_size_changed(memcg); 6461 return nbytes; 6462} 6463 6464static int memory_max_show(struct seq_file *m, void *v) 6465{ 6466 return seq_puts_memcg_tunable(m, 6467 READ_ONCE(mem_cgroup_from_seq(m)->memory.max)); 6468} 6469 6470static ssize_t memory_max_write(struct kernfs_open_file *of, 6471 char *buf, size_t nbytes, loff_t off) 6472{ 6473 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 6474 unsigned int nr_reclaims = MAX_RECLAIM_RETRIES; 6475 bool drained = false; 6476 unsigned long max; 6477 int err; 6478 6479 buf = strstrip(buf); 6480 err = page_counter_memparse(buf, "max", 
&max); 6481 if (err) 6482 return err; 6483 6484 xchg(&memcg->memory.max, max); 6485 6486 for (;;) { 6487 unsigned long nr_pages = page_counter_read(&memcg->memory); 6488 6489 if (nr_pages <= max) 6490 break; 6491 6492 if (signal_pending(current)) 6493 break; 6494 6495 if (!drained) { 6496 drain_all_stock(memcg); 6497 drained = true; 6498 continue; 6499 } 6500 6501 if (nr_reclaims) { 6502 if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max, 6503 GFP_KERNEL, true)) 6504 nr_reclaims--; 6505 continue; 6506 } 6507 6508 memcg_memory_event(memcg, MEMCG_OOM); 6509 if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0)) 6510 break; 6511 } 6512 6513 memcg_wb_domain_size_changed(memcg); 6514 return nbytes; 6515} 6516 6517static void __memory_events_show(struct seq_file *m, atomic_long_t *events) 6518{ 6519 seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW])); 6520 seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH])); 6521 seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX])); 6522 seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM])); 6523 seq_printf(m, "oom_kill %lu\n", 6524 atomic_long_read(&events[MEMCG_OOM_KILL])); 6525} 6526 6527static int memory_events_show(struct seq_file *m, void *v) 6528{ 6529 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 6530 6531 __memory_events_show(m, memcg->memory_events); 6532 return 0; 6533} 6534 6535static int memory_events_local_show(struct seq_file *m, void *v) 6536{ 6537 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 6538 6539 __memory_events_show(m, memcg->memory_events_local); 6540 return 0; 6541} 6542 6543static int memory_stat_show(struct seq_file *m, void *v) 6544{ 6545 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 6546 char *buf; 6547 6548 buf = memory_stat_format(memcg); 6549 if (!buf) 6550 return -ENOMEM; 6551 seq_puts(m, buf); 6552 kfree(buf); 6553 return 0; 6554} 6555 6556#ifdef CONFIG_NUMA 6557static int memory_numa_stat_show(struct seq_file *m, void *v) 6558{ 6559 int i; 6560 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 6561 6562 for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { 6563 int nid; 6564 6565 if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS) 6566 continue; 6567 6568 seq_printf(m, "%s", memory_stats[i].name); 6569 for_each_node_state(nid, N_MEMORY) { 6570 u64 size; 6571 struct lruvec *lruvec; 6572 6573 lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); 6574 size = lruvec_page_state(lruvec, memory_stats[i].idx); 6575 size *= memory_stats[i].ratio; 6576 seq_printf(m, " N%d=%llu", nid, size); 6577 } 6578 seq_putc(m, '\n'); 6579 } 6580 6581 return 0; 6582} 6583#endif 6584 6585static int memory_oom_group_show(struct seq_file *m, void *v) 6586{ 6587 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 6588 6589 seq_printf(m, "%d\n", memcg->oom_group); 6590 6591 return 0; 6592} 6593 6594static ssize_t memory_oom_group_write(struct kernfs_open_file *of, 6595 char *buf, size_t nbytes, loff_t off) 6596{ 6597 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 6598 int ret, oom_group; 6599 6600 buf = strstrip(buf); 6601 if (!buf) 6602 return -EINVAL; 6603 6604 ret = kstrtoint(buf, 0, &oom_group); 6605 if (ret) 6606 return ret; 6607 6608 if (oom_group != 0 && oom_group != 1) 6609 return -EINVAL; 6610 6611 memcg->oom_group = oom_group; 6612 6613 return nbytes; 6614} 6615 6616static struct cftype memory_files[] = { 6617 { 6618 .name = "current", 6619 .flags = CFTYPE_NOT_ON_ROOT, 6620 .read_u64 = memory_current_read, 6621 }, 6622 { 6623 .name = "min", 6624 .flags = CFTYPE_NOT_ON_ROOT, 
6625 .seq_show = memory_min_show, 6626 .write = memory_min_write, 6627 }, 6628 { 6629 .name = "low", 6630 .flags = CFTYPE_NOT_ON_ROOT, 6631 .seq_show = memory_low_show, 6632 .write = memory_low_write, 6633 }, 6634 { 6635 .name = "high", 6636 .flags = CFTYPE_NOT_ON_ROOT, 6637 .seq_show = memory_high_show, 6638 .write = memory_high_write, 6639 }, 6640 { 6641 .name = "max", 6642 .flags = CFTYPE_NOT_ON_ROOT, 6643 .seq_show = memory_max_show, 6644 .write = memory_max_write, 6645 }, 6646 { 6647 .name = "events", 6648 .flags = CFTYPE_NOT_ON_ROOT, 6649 .file_offset = offsetof(struct mem_cgroup, events_file), 6650 .seq_show = memory_events_show, 6651 }, 6652 { 6653 .name = "events.local", 6654 .flags = CFTYPE_NOT_ON_ROOT, 6655 .file_offset = offsetof(struct mem_cgroup, events_local_file), 6656 .seq_show = memory_events_local_show, 6657 }, 6658 { 6659 .name = "stat", 6660 .seq_show = memory_stat_show, 6661 }, 6662#ifdef CONFIG_NUMA 6663 { 6664 .name = "numa_stat", 6665 .seq_show = memory_numa_stat_show, 6666 }, 6667#endif 6668 { 6669 .name = "oom.group", 6670 .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE, 6671 .seq_show = memory_oom_group_show, 6672 .write = memory_oom_group_write, 6673 }, 6674 { } /* terminate */ 6675}; 6676 6677struct cgroup_subsys memory_cgrp_subsys = { 6678 .css_alloc = mem_cgroup_css_alloc, 6679 .css_online = mem_cgroup_css_online, 6680 .css_offline = mem_cgroup_css_offline, 6681 .css_released = mem_cgroup_css_released, 6682 .css_free = mem_cgroup_css_free, 6683 .css_reset = mem_cgroup_css_reset, 6684 .can_attach = mem_cgroup_can_attach, 6685 .cancel_attach = mem_cgroup_cancel_attach, 6686 .post_attach = mem_cgroup_move_task, 6687 .bind = mem_cgroup_bind, 6688 .dfl_cftypes = memory_files, 6689 .legacy_cftypes = mem_cgroup_legacy_files, 6690 .early_init = 0, 6691}; 6692 6693/* 6694 * This function calculates an individual cgroup's effective 6695 * protection which is derived from its own memory.min/low, its 6696 * parent's and siblings' settings, as well as the actual memory 6697 * distribution in the tree. 6698 * 6699 * The following rules apply to the effective protection values: 6700 * 6701 * 1. At the first level of reclaim, effective protection is equal to 6702 * the declared protection in memory.min and memory.low. 6703 * 6704 * 2. To enable safe delegation of the protection configuration, at 6705 * subsequent levels the effective protection is capped to the 6706 * parent's effective protection. 6707 * 6708 * 3. To make complex and dynamic subtrees easier to configure, the 6709 * user is allowed to overcommit the declared protection at a given 6710 * level. If that is the case, the parent's effective protection is 6711 * distributed to the children in proportion to how much protection 6712 * they have declared and how much of it they are utilizing. 6713 * 6714 * This makes distribution proportional, but also work-conserving: 6715 * if one cgroup claims much more protection than it uses memory, 6716 * the unused remainder is available to its siblings. 6717 * 6718 * 4. Conversely, when the declared protection is undercommitted at a 6719 * given level, the distribution of the larger parental protection 6720 * budget is NOT proportional. A cgroup's protection from a sibling 6721 * is capped to its own memory.min/low setting. 6722 * 6723 * 5. 
However, to allow protecting recursive subtrees from each other 6724 * without having to declare each individual cgroup's fixed share 6725 * of the ancestor's claim to protection, any unutilized - 6726 * "floating" - protection from up the tree is distributed in 6727 * proportion to each cgroup's *usage*. This makes the protection 6728 * neutral wrt sibling cgroups and lets them compete freely over 6729 * the shared parental protection budget, but it protects the 6730 * subtree as a whole from neighboring subtrees. 6731 * 6732 * Note that 4. and 5. are not in conflict: 4. is about protecting 6733 * against immediate siblings whereas 5. is about protecting against 6734 * neighboring subtrees. 6735 */ 6736static unsigned long effective_protection(unsigned long usage, 6737 unsigned long parent_usage, 6738 unsigned long setting, 6739 unsigned long parent_effective, 6740 unsigned long siblings_protected) 6741{ 6742 unsigned long protected; 6743 unsigned long ep; 6744 6745 protected = min(usage, setting); 6746 /* 6747 * If all cgroups at this level combined claim and use more 6748 * protection than what the parent affords them, distribute 6749 * shares in proportion to utilization. 6750 * 6751 * We are using actual utilization rather than the statically 6752 * claimed protection in order to be work-conserving: claimed 6753 * but unused protection is available to siblings that would 6754 * otherwise get a smaller chunk than what they claimed. 6755 */ 6756 if (siblings_protected > parent_effective) 6757 return protected * parent_effective / siblings_protected; 6758 6759 /* 6760 * Ok, utilized protection of all children is within what the 6761 * parent affords them, so we know whatever this child claims 6762 * and utilizes is effectively protected. 6763 * 6764 * If there is unprotected usage beyond this value, reclaim 6765 * will apply pressure in proportion to that amount. 6766 * 6767 * If there is unutilized protection, the cgroup will be fully 6768 * shielded from reclaim, but we do return a smaller value for 6769 * protection than what the group could enjoy in theory. This 6770 * is okay. With the overcommit distribution above, effective 6771 * protection is always dependent on how memory is actually 6772 * consumed among the siblings anyway. 6773 */ 6774 ep = protected; 6775 6776 /* 6777 * If the children aren't claiming (all of) the protection 6778 * afforded to them by the parent, distribute the remainder in 6779 * proportion to the (unprotected) memory of each cgroup. That 6780 * way, cgroups that aren't explicitly prioritized wrt each 6781 * other compete freely over the allowance, but they are 6782 * collectively protected from neighboring trees. 6783 * 6784 * We're using unprotected memory for the weight so that if 6785 * some cgroups DO claim explicit protection, we don't protect 6786 * the same bytes twice. 6787 * 6788 * Check both usage and parent_usage against the respective 6789 * protected values. One should imply the other, but they 6790 * aren't read atomically - make sure the division is sane.
6791 */ 6792 if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)) 6793 return ep; 6794 if (parent_effective > siblings_protected && 6795 parent_usage > siblings_protected && 6796 usage > protected) { 6797 unsigned long unclaimed; 6798 6799 unclaimed = parent_effective - siblings_protected; 6800 unclaimed *= usage - protected; 6801 unclaimed /= parent_usage - siblings_protected; 6802 6803 ep += unclaimed; 6804 } 6805 6806 return ep; 6807} 6808 6809/** 6810 * mem_cgroup_calculate_protection - check if memory consumption is in the normal range 6811 * @root: the top ancestor of the sub-tree being checked 6812 * @memcg: the memory cgroup to check 6813 * 6814 * WARNING: This function is not stateless! It can only be used as part 6815 * of a top-down tree iteration, not for isolated queries. 6816 */ 6817void mem_cgroup_calculate_protection(struct mem_cgroup *root, 6818 struct mem_cgroup *memcg) 6819{ 6820 unsigned long usage, parent_usage; 6821 struct mem_cgroup *parent; 6822 6823 if (mem_cgroup_disabled()) 6824 return; 6825 6826 if (!root) 6827 root = root_mem_cgroup; 6828 6829 /* 6830 * Effective values of the reclaim targets are ignored so they 6831 * can be stale. Have a look at mem_cgroup_protection for more 6832 * details. 6833 * TODO: calculation should be more robust so that we do not need 6834 * that special casing. 6835 */ 6836 if (memcg == root) 6837 return; 6838 6839 usage = page_counter_read(&memcg->memory); 6840 if (!usage) 6841 return; 6842 6843 parent = parent_mem_cgroup(memcg); 6844 /* No parent means a non-hierarchical mode on v1 memcg */ 6845 if (!parent) 6846 return; 6847 6848 if (parent == root) { 6849 memcg->memory.emin = READ_ONCE(memcg->memory.min); 6850 memcg->memory.elow = READ_ONCE(memcg->memory.low); 6851 return; 6852 } 6853 6854 parent_usage = page_counter_read(&parent->memory); 6855 6856 WRITE_ONCE(memcg->memory.emin, effective_protection(usage, parent_usage, 6857 READ_ONCE(memcg->memory.min), 6858 READ_ONCE(parent->memory.emin), 6859 atomic_long_read(&parent->memory.children_min_usage))); 6860 6861 WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage, 6862 READ_ONCE(memcg->memory.low), 6863 READ_ONCE(parent->memory.elow), 6864 atomic_long_read(&parent->memory.children_low_usage))); 6865} 6866 6867/** 6868 * mem_cgroup_charge - charge a newly allocated page to a cgroup 6869 * @page: page to charge 6870 * @mm: mm context of the victim 6871 * @gfp_mask: reclaim mode 6872 * 6873 * Try to charge @page to the memcg that @mm belongs to, reclaiming 6874 * pages according to @gfp_mask if necessary. 6875 * 6876 * Returns 0 on success. Otherwise, an error code is returned. 6877 */ 6878int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) 6879{ 6880 unsigned int nr_pages = thp_nr_pages(page); 6881 struct mem_cgroup *memcg = NULL; 6882 int ret = 0; 6883 6884 if (mem_cgroup_disabled()) 6885 goto out; 6886 6887 if (PageSwapCache(page)) { 6888 swp_entry_t ent = { .val = page_private(page), }; 6889 unsigned short id; 6890 6891 /* 6892 * Every swap fault against a single page tries to charge the 6893 * page, bail as early as possible. shmem_unuse() encounters 6894 * already charged pages, too. page->mem_cgroup is protected 6895 * by the page lock, which serializes swap cache removal, which 6896 * in turn serializes uncharging.
6897 */ 6898 VM_BUG_ON_PAGE(!PageLocked(page), page); 6899 if (compound_head(page)->mem_cgroup) 6900 goto out; 6901 6902 id = lookup_swap_cgroup_id(ent); 6903 rcu_read_lock(); 6904 memcg = mem_cgroup_from_id(id); 6905 if (memcg && !css_tryget_online(&memcg->css)) 6906 memcg = NULL; 6907 rcu_read_unlock(); 6908 } 6909 6910 if (!memcg) 6911 memcg = get_mem_cgroup_from_mm(mm); 6912 6913 ret = try_charge(memcg, gfp_mask, nr_pages); 6914 if (ret) 6915 goto out_put; 6916 6917 css_get(&memcg->css); 6918 commit_charge(page, memcg); 6919 6920 local_irq_disable(); 6921 mem_cgroup_charge_statistics(memcg, page, nr_pages); 6922 memcg_check_events(memcg, page); 6923 local_irq_enable(); 6924 6925 /* 6926 * Cgroup1's unified memory+swap counter has been charged with the 6927 * new swapcache page, finish the transfer by uncharging the swap 6928 * slot. The swap slot would also get uncharged when it dies, but 6929 * it can stick around indefinitely and we'd count the page twice 6930 * the entire time. 6931 * 6932 * Cgroup2 has separate resource counters for memory and swap, 6933 * so this is a non-issue here. Memory and swap charge lifetimes 6934 * correspond 1:1 to page and swap slot lifetimes: we charge the 6935 * page to memory here, and uncharge swap when the slot is freed. 6936 */ 6937 if (do_memsw_account() && PageSwapCache(page)) { 6938 swp_entry_t entry = { .val = page_private(page) }; 6939 /* 6940 * The swap entry might not get freed for a long time, 6941 * let's not wait for it. The page already received a 6942 * memory+swap charge, drop the swap entry duplicate. 6943 */ 6944 mem_cgroup_uncharge_swap(entry, nr_pages); 6945 } 6946 6947out_put: 6948 css_put(&memcg->css); 6949out: 6950 return ret; 6951} 6952 6953struct uncharge_gather { 6954 struct mem_cgroup *memcg; 6955 unsigned long nr_pages; 6956 unsigned long pgpgout; 6957 unsigned long nr_kmem; 6958 struct page *dummy_page; 6959}; 6960 6961static inline void uncharge_gather_clear(struct uncharge_gather *ug) 6962{ 6963 memset(ug, 0, sizeof(*ug)); 6964} 6965 6966static void uncharge_batch(const struct uncharge_gather *ug) 6967{ 6968 unsigned long flags; 6969 6970 if (!mem_cgroup_is_root(ug->memcg)) { 6971 page_counter_uncharge(&ug->memcg->memory, ug->nr_pages); 6972 if (do_memsw_account()) 6973 page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages); 6974 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem) 6975 page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem); 6976 memcg_oom_recover(ug->memcg); 6977 } 6978 6979 local_irq_save(flags); 6980 __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout); 6981 __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages); 6982 memcg_check_events(ug->memcg, ug->dummy_page); 6983 local_irq_restore(flags); 6984 6985 /* drop reference from uncharge_page */ 6986 css_put(&ug->memcg->css); 6987} 6988 6989static void uncharge_page(struct page *page, struct uncharge_gather *ug) 6990{ 6991 unsigned long nr_pages; 6992 6993 VM_BUG_ON_PAGE(PageLRU(page), page); 6994 6995 if (!page->mem_cgroup) 6996 return; 6997 6998 /* 6999 * Nobody should be changing or seriously looking at 7000 * page->mem_cgroup at this point, we have fully 7001 * exclusive access to the page. 
7002 */ 7003 7004 if (ug->memcg != page->mem_cgroup) { 7005 if (ug->memcg) { 7006 uncharge_batch(ug); 7007 uncharge_gather_clear(ug); 7008 } 7009 ug->memcg = page->mem_cgroup; 7010 7011 /* pairs with css_put in uncharge_batch */ 7012 css_get(&ug->memcg->css); 7013 } 7014 7015 nr_pages = compound_nr(page); 7016 ug->nr_pages += nr_pages; 7017 7018 if (!PageKmemcg(page)) { 7019 ug->pgpgout++; 7020 } else { 7021 ug->nr_kmem += nr_pages; 7022 __ClearPageKmemcg(page); 7023 } 7024 7025 ug->dummy_page = page; 7026 page->mem_cgroup = NULL; 7027 css_put(&ug->memcg->css); 7028} 7029 7030static void uncharge_list(struct list_head *page_list) 7031{ 7032 struct uncharge_gather ug; 7033 struct list_head *next; 7034 7035 uncharge_gather_clear(&ug); 7036 7037 /* 7038 * Note that the list can be a single page->lru; hence the 7039 * do-while loop instead of a simple list_for_each_entry(). 7040 */ 7041 next = page_list->next; 7042 do { 7043 struct page *page; 7044 7045 page = list_entry(next, struct page, lru); 7046 next = page->lru.next; 7047 7048 uncharge_page(page, &ug); 7049 } while (next != page_list); 7050 7051 if (ug.memcg) 7052 uncharge_batch(&ug); 7053} 7054 7055/** 7056 * mem_cgroup_uncharge - uncharge a page 7057 * @page: page to uncharge 7058 * 7059 * Uncharge a page previously charged with mem_cgroup_charge(). 7060 */ 7061void mem_cgroup_uncharge(struct page *page) 7062{ 7063 struct uncharge_gather ug; 7064 7065 if (mem_cgroup_disabled()) 7066 return; 7067 7068 /* Don't touch page->lru of any random page, pre-check: */ 7069 if (!page->mem_cgroup) 7070 return; 7071 7072 uncharge_gather_clear(&ug); 7073 uncharge_page(page, &ug); 7074 uncharge_batch(&ug); 7075} 7076 7077/** 7078 * mem_cgroup_uncharge_list - uncharge a list of page 7079 * @page_list: list of pages to uncharge 7080 * 7081 * Uncharge a list of pages previously charged with 7082 * mem_cgroup_charge(). 7083 */ 7084void mem_cgroup_uncharge_list(struct list_head *page_list) 7085{ 7086 if (mem_cgroup_disabled()) 7087 return; 7088 7089 if (!list_empty(page_list)) 7090 uncharge_list(page_list); 7091} 7092 7093/** 7094 * mem_cgroup_migrate - charge a page's replacement 7095 * @oldpage: currently circulating page 7096 * @newpage: replacement page 7097 * 7098 * Charge @newpage as a replacement page for @oldpage. @oldpage will 7099 * be uncharged upon free. 7100 * 7101 * Both pages must be locked, @newpage->mapping must be set up. 7102 */ 7103void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) 7104{ 7105 struct mem_cgroup *memcg; 7106 unsigned int nr_pages; 7107 unsigned long flags; 7108 7109 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); 7110 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); 7111 VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage); 7112 VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage), 7113 newpage); 7114 7115 if (mem_cgroup_disabled()) 7116 return; 7117 7118 /* Page cache replacement: new page already charged? */ 7119 if (newpage->mem_cgroup) 7120 return; 7121 7122 /* Swapcache readahead pages can get replaced before being charged */ 7123 memcg = oldpage->mem_cgroup; 7124 if (!memcg) 7125 return; 7126 7127 /* Force-charge the new page. 
The old one will be freed soon */ 7128 nr_pages = thp_nr_pages(newpage); 7129 7130 page_counter_charge(&memcg->memory, nr_pages); 7131 if (do_memsw_account()) 7132 page_counter_charge(&memcg->memsw, nr_pages); 7133 7134 css_get(&memcg->css); 7135 commit_charge(newpage, memcg); 7136 7137 local_irq_save(flags); 7138 mem_cgroup_charge_statistics(memcg, newpage, nr_pages); 7139 memcg_check_events(memcg, newpage); 7140 local_irq_restore(flags); 7141} 7142 7143DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key); 7144EXPORT_SYMBOL(memcg_sockets_enabled_key); 7145 7146void mem_cgroup_sk_alloc(struct sock *sk) 7147{ 7148 struct mem_cgroup *memcg; 7149 7150 if (!mem_cgroup_sockets_enabled) 7151 return; 7152 7153 /* Do not associate the sock with unrelated interrupted task's memcg. */ 7154 if (in_interrupt()) 7155 return; 7156 7157 rcu_read_lock(); 7158 memcg = mem_cgroup_from_task(current); 7159 if (memcg == root_mem_cgroup) 7160 goto out; 7161 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active) 7162 goto out; 7163 if (css_tryget(&memcg->css)) 7164 sk->sk_memcg = memcg; 7165out: 7166 rcu_read_unlock(); 7167} 7168 7169void mem_cgroup_sk_free(struct sock *sk) 7170{ 7171 if (sk->sk_memcg) 7172 css_put(&sk->sk_memcg->css); 7173} 7174 7175/** 7176 * mem_cgroup_charge_skmem - charge socket memory 7177 * @memcg: memcg to charge 7178 * @nr_pages: number of pages to charge 7179 * 7180 * Charges @nr_pages to @memcg. Returns %true if the charge fit within 7181 * @memcg's configured limit, %false if the charge had to be forced. 7182 */ 7183bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) 7184{ 7185 gfp_t gfp_mask = GFP_KERNEL; 7186 7187 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { 7188 struct page_counter *fail; 7189 7190 if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) { 7191 memcg->tcpmem_pressure = 0; 7192 return true; 7193 } 7194 page_counter_charge(&memcg->tcpmem, nr_pages); 7195 memcg->tcpmem_pressure = 1; 7196 return false; 7197 } 7198 7199 /* Don't block in the packet receive path */ 7200 if (in_softirq()) 7201 gfp_mask = GFP_NOWAIT; 7202 7203 mod_memcg_state(memcg, MEMCG_SOCK, nr_pages); 7204 7205 if (try_charge(memcg, gfp_mask, nr_pages) == 0) 7206 return true; 7207 7208 try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages); 7209 return false; 7210} 7211 7212/** 7213 * mem_cgroup_uncharge_skmem - uncharge socket memory 7214 * @memcg: memcg to uncharge 7215 * @nr_pages: number of pages to uncharge 7216 */ 7217void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) 7218{ 7219 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { 7220 page_counter_uncharge(&memcg->tcpmem, nr_pages); 7221 return; 7222 } 7223 7224 mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages); 7225 7226 refill_stock(memcg, nr_pages); 7227} 7228 7229static int __init cgroup_memory(char *s) 7230{ 7231 char *token; 7232 7233 while ((token = strsep(&s, ",")) != NULL) { 7234 if (!*token) 7235 continue; 7236 if (!strcmp(token, "nosocket")) 7237 cgroup_memory_nosocket = true; 7238 if (!strcmp(token, "nokmem")) 7239 cgroup_memory_nokmem = true; 7240 else if (!strcmp(token, "kmem")) 7241 cgroup_memory_nokmem = false; 7242 } 7243 return 1; 7244} 7245__setup("cgroup.memory=", cgroup_memory); 7246 7247/* 7248 * subsys_initcall() for memory controller. 
7249 * 7250 * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this 7251 * context because of lock dependencies (cgroup_lock -> cpu hotplug) but 7252 * basically everything that doesn't depend on a specific mem_cgroup structure 7253 * should be initialized from here. 7254 */ 7255static int __init mem_cgroup_init(void) 7256{ 7257 int cpu, node; 7258 7259 cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL, 7260 memcg_hotplug_cpu_dead); 7261 7262 for_each_possible_cpu(cpu) 7263 INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work, 7264 drain_local_stock); 7265 7266 for_each_node(node) { 7267 struct mem_cgroup_tree_per_node *rtpn; 7268 7269 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, 7270 node_online(node) ? node : NUMA_NO_NODE); 7271 7272 rtpn->rb_root = RB_ROOT; 7273 rtpn->rb_rightmost = NULL; 7274 spin_lock_init(&rtpn->lock); 7275 soft_limit_tree.rb_tree_per_node[node] = rtpn; 7276 } 7277 7278 return 0; 7279} 7280subsys_initcall(mem_cgroup_init); 7281 7282#ifdef CONFIG_MEMCG_SWAP 7283static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) 7284{ 7285 while (!refcount_inc_not_zero(&memcg->id.ref)) { 7286 /* 7287 * The root cgroup cannot be destroyed, so it's refcount must 7288 * always be >= 1. 7289 */ 7290 if (WARN_ON_ONCE(memcg == root_mem_cgroup)) { 7291 VM_BUG_ON(1); 7292 break; 7293 } 7294 memcg = parent_mem_cgroup(memcg); 7295 if (!memcg) 7296 memcg = root_mem_cgroup; 7297 } 7298 return memcg; 7299} 7300 7301/** 7302 * mem_cgroup_swapout - transfer a memsw charge to swap 7303 * @page: page whose memsw charge to transfer 7304 * @entry: swap entry to move the charge to 7305 * 7306 * Transfer the memsw charge of @page to @entry. 7307 */ 7308void mem_cgroup_swapout(struct page *page, swp_entry_t entry) 7309{ 7310 struct mem_cgroup *memcg, *swap_memcg; 7311 unsigned int nr_entries; 7312 unsigned short oldid; 7313 7314 VM_BUG_ON_PAGE(PageLRU(page), page); 7315 VM_BUG_ON_PAGE(page_count(page), page); 7316 7317 if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 7318 return; 7319 7320 memcg = page->mem_cgroup; 7321 7322 /* Readahead page, never charged */ 7323 if (!memcg) 7324 return; 7325 7326 /* 7327 * In case the memcg owning these pages has been offlined and doesn't 7328 * have an ID allocated to it anymore, charge the closest online 7329 * ancestor for the swap instead and transfer the memory+swap charge. 7330 */ 7331 swap_memcg = mem_cgroup_id_get_online(memcg); 7332 nr_entries = thp_nr_pages(page); 7333 /* Get references for the tail pages, too */ 7334 if (nr_entries > 1) 7335 mem_cgroup_id_get_many(swap_memcg, nr_entries - 1); 7336 oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), 7337 nr_entries); 7338 VM_BUG_ON_PAGE(oldid, page); 7339 mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); 7340 7341 page->mem_cgroup = NULL; 7342 7343 if (!mem_cgroup_is_root(memcg)) 7344 page_counter_uncharge(&memcg->memory, nr_entries); 7345 7346 if (!cgroup_memory_noswap && memcg != swap_memcg) { 7347 if (!mem_cgroup_is_root(swap_memcg)) 7348 page_counter_charge(&swap_memcg->memsw, nr_entries); 7349 page_counter_uncharge(&memcg->memsw, nr_entries); 7350 } 7351 7352 /* 7353 * Interrupts should be disabled here because the caller holds the 7354 * i_pages lock which is taken with interrupts-off. It is 7355 * important here to have the interrupts disabled because it is the 7356 * only synchronisation we have for updating the per-CPU variables. 
7357 */ 7358 VM_BUG_ON(!irqs_disabled()); 7359 mem_cgroup_charge_statistics(memcg, page, -nr_entries); 7360 memcg_check_events(memcg, page); 7361 7362 css_put(&memcg->css); 7363} 7364 7365/** 7366 * mem_cgroup_try_charge_swap - try charging swap space for a page 7367 * @page: page being added to swap 7368 * @entry: swap entry to charge 7369 * 7370 * Try to charge @page's memcg for the swap space at @entry. 7371 * 7372 * Returns 0 on success, -ENOMEM on failure. 7373 */ 7374int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) 7375{ 7376 unsigned int nr_pages = thp_nr_pages(page); 7377 struct page_counter *counter; 7378 struct mem_cgroup *memcg; 7379 unsigned short oldid; 7380 7381 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) 7382 return 0; 7383 7384 memcg = page->mem_cgroup; 7385 7386 /* Readahead page, never charged */ 7387 if (!memcg) 7388 return 0; 7389 7390 if (!entry.val) { 7391 memcg_memory_event(memcg, MEMCG_SWAP_FAIL); 7392 return 0; 7393 } 7394 7395 memcg = mem_cgroup_id_get_online(memcg); 7396 7397 if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg) && 7398 !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { 7399 memcg_memory_event(memcg, MEMCG_SWAP_MAX); 7400 memcg_memory_event(memcg, MEMCG_SWAP_FAIL); 7401 mem_cgroup_id_put(memcg); 7402 return -ENOMEM; 7403 } 7404 7405 /* Get references for the tail pages, too */ 7406 if (nr_pages > 1) 7407 mem_cgroup_id_get_many(memcg, nr_pages - 1); 7408 oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages); 7409 VM_BUG_ON_PAGE(oldid, page); 7410 mod_memcg_state(memcg, MEMCG_SWAP, nr_pages); 7411 7412 return 0; 7413} 7414 7415/** 7416 * mem_cgroup_uncharge_swap - uncharge swap space 7417 * @entry: swap entry to uncharge 7418 * @nr_pages: the amount of swap space to uncharge 7419 */ 7420void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) 7421{ 7422 struct mem_cgroup *memcg; 7423 unsigned short id; 7424 7425 id = swap_cgroup_record(entry, 0, nr_pages); 7426 rcu_read_lock(); 7427 memcg = mem_cgroup_from_id(id); 7428 if (memcg) { 7429 if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg)) { 7430 if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 7431 page_counter_uncharge(&memcg->swap, nr_pages); 7432 else 7433 page_counter_uncharge(&memcg->memsw, nr_pages); 7434 } 7435 mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages); 7436 mem_cgroup_id_put_many(memcg, nr_pages); 7437 } 7438 rcu_read_unlock(); 7439} 7440 7441long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) 7442{ 7443 long nr_swap_pages = get_nr_swap_pages(); 7444 7445 if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) 7446 return nr_swap_pages; 7447 for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) 7448 nr_swap_pages = min_t(long, nr_swap_pages, 7449 READ_ONCE(memcg->swap.max) - 7450 page_counter_read(&memcg->swap)); 7451 return nr_swap_pages; 7452} 7453 7454bool mem_cgroup_swap_full(struct page *page) 7455{ 7456 struct mem_cgroup *memcg; 7457 7458 VM_BUG_ON_PAGE(!PageLocked(page), page); 7459 7460 if (vm_swap_full()) 7461 return true; 7462 if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) 7463 return false; 7464 7465 memcg = page->mem_cgroup; 7466 if (!memcg) 7467 return false; 7468 7469 for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) { 7470 unsigned long usage = page_counter_read(&memcg->swap); 7471 7472 if (usage * 2 >= READ_ONCE(memcg->swap.high) || 7473 usage * 2 >= READ_ONCE(memcg->swap.max)) 7474 return true; 7475 } 7476 7477 return false; 7478} 
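/*
 * Illustrative sketch, not code that is built here: how reclaim might
 * consult the two helpers above. The first test is loosely modeled on
 * get_scan_count() in mm/vmscan.c (skip scanning anon pages when no
 * swap is usable under the memcg's limits); the second on
 * shrink_page_list(), which drops the swap cache copy early once swap
 * is at least half full. The surrounding variables are hypothetical.
 *
 *	if (mem_cgroup_get_nr_swap_pages(memcg) <= 0)
 *		scan_anon = false;		// nothing can be swapped out
 *
 *	if (PageSwapCache(page) && mem_cgroup_swap_full(page))
 *		try_to_free_swap(page);		// reclaim the swap slot early
 */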
7479 7480static int __init setup_swap_account(char *s) 7481{ 7482 if (!strcmp(s, "1")) 7483 cgroup_memory_noswap = 0; 7484 else if (!strcmp(s, "0")) 7485 cgroup_memory_noswap = 1; 7486 return 1; 7487} 7488__setup("swapaccount=", setup_swap_account); 7489 7490static u64 swap_current_read(struct cgroup_subsys_state *css, 7491 struct cftype *cft) 7492{ 7493 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 7494 7495 return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE; 7496} 7497 7498static int swap_high_show(struct seq_file *m, void *v) 7499{ 7500 return seq_puts_memcg_tunable(m, 7501 READ_ONCE(mem_cgroup_from_seq(m)->swap.high)); 7502} 7503 7504static ssize_t swap_high_write(struct kernfs_open_file *of, 7505 char *buf, size_t nbytes, loff_t off) 7506{ 7507 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 7508 unsigned long high; 7509 int err; 7510 7511 buf = strstrip(buf); 7512 err = page_counter_memparse(buf, "max", &high); 7513 if (err) 7514 return err; 7515 7516 page_counter_set_high(&memcg->swap, high); 7517 7518 return nbytes; 7519} 7520 7521static int swap_max_show(struct seq_file *m, void *v) 7522{ 7523 return seq_puts_memcg_tunable(m, 7524 READ_ONCE(mem_cgroup_from_seq(m)->swap.max)); 7525} 7526 7527static ssize_t swap_max_write(struct kernfs_open_file *of, 7528 char *buf, size_t nbytes, loff_t off) 7529{ 7530 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 7531 unsigned long max; 7532 int err; 7533 7534 buf = strstrip(buf); 7535 err = page_counter_memparse(buf, "max", &max); 7536 if (err) 7537 return err; 7538 7539 xchg(&memcg->swap.max, max); 7540 7541 return nbytes; 7542} 7543 7544static int swap_events_show(struct seq_file *m, void *v) 7545{ 7546 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 7547 7548 seq_printf(m, "high %lu\n", 7549 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH])); 7550 seq_printf(m, "max %lu\n", 7551 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX])); 7552 seq_printf(m, "fail %lu\n", 7553 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL])); 7554 7555 return 0; 7556} 7557 7558static struct cftype swap_files[] = { 7559 { 7560 .name = "swap.current", 7561 .flags = CFTYPE_NOT_ON_ROOT, 7562 .read_u64 = swap_current_read, 7563 }, 7564 { 7565 .name = "swap.high", 7566 .flags = CFTYPE_NOT_ON_ROOT, 7567 .seq_show = swap_high_show, 7568 .write = swap_high_write, 7569 }, 7570 { 7571 .name = "swap.max", 7572 .flags = CFTYPE_NOT_ON_ROOT, 7573 .seq_show = swap_max_show, 7574 .write = swap_max_write, 7575 }, 7576 { 7577 .name = "swap.events", 7578 .flags = CFTYPE_NOT_ON_ROOT, 7579 .file_offset = offsetof(struct mem_cgroup, swap_events_file), 7580 .seq_show = swap_events_show, 7581 }, 7582 { } /* terminate */ 7583}; 7584 7585static struct cftype memsw_files[] = { 7586 { 7587 .name = "memsw.usage_in_bytes", 7588 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 7589 .read_u64 = mem_cgroup_read_u64, 7590 }, 7591 { 7592 .name = "memsw.max_usage_in_bytes", 7593 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 7594 .write = mem_cgroup_reset, 7595 .read_u64 = mem_cgroup_read_u64, 7596 }, 7597 { 7598 .name = "memsw.limit_in_bytes", 7599 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 7600 .write = mem_cgroup_write, 7601 .read_u64 = mem_cgroup_read_u64, 7602 }, 7603 { 7604 .name = "memsw.failcnt", 7605 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 7606 .write = mem_cgroup_reset, 7607 .read_u64 = mem_cgroup_read_u64, 7608 }, 7609 { }, /* terminate */ 7610}; 7611 7612/* 7613 * If mem_cgroup_swap_init() is implemented as a 
subsys_initcall() 7614 * instead of a core_initcall(), cgroup_memory_noswap could remain false even 7615 * when memcg is disabled via the "cgroup_disable=memory" boot parameter, which 7616 * may result in a premature oops inside mem_cgroup_get_nr_swap_pages() in 7617 * corner cases. 7618 */ 7619static int __init mem_cgroup_swap_init(void) 7620{ 7621 /* No memory control -> no swap control */ 7622 if (mem_cgroup_disabled()) 7623 cgroup_memory_noswap = true; 7624 7625 if (cgroup_memory_noswap) 7626 return 0; 7627 7628 WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files)); 7629 WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files)); 7630 7631 return 0; 7632} 7633core_initcall(mem_cgroup_swap_init); 7634 7635#endif /* CONFIG_MEMCG_SWAP */ 7636
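/*
 * Illustrative summary (a sketch, not code that is built here) of the swap
 * charging lifecycle implemented above, based on how the existing callers in
 * mm/swap_slots.c, mm/vmscan.c and mm/swapfile.c are wired up; the ordering
 * is simplified:
 *
 *	entry = get_swap_page(page);	// on cgroup2, mem_cgroup_try_charge_swap()
 *					// charges memcg->swap and records the owner
 *	...
 *	mem_cgroup_swapout(page, entry);	// on cgroup1, records the owner and
 *						// moves the page's charge to the entry
 *	...
 *	swap_free(entry);	// once the last reference to the slot is gone,
 *				// this ends up in mem_cgroup_uncharge_swap()
 */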