// SPDX-License-Identifier: GPL-2.0
/*
 * mm/memcg_reclaim.c
 *
 * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd.
 */
#include <linux/mm.h>
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/hyperhold_inf.h>
#include <linux/memcontrol.h>

#ifdef CONFIG_HYPERHOLD_FILE_LRU
#include <linux/memcg_policy.h>
#include "internal.h"
#endif

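/*
 * Swap-backed reclaim is ruled out if the caller forbids swapping,
 * swappiness is zero, or there is no free swap space left.
 */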
static inline bool is_swap_not_allowed(struct scan_control *sc, int swappiness)
{
	return !sc->may_swap || !swappiness || !get_nr_swap_pages();
}

/*
 * From 0 .. 100. Higher means more swappy.
 */
#define HYPERHOLD_SWAPPINESS 100

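/*
 * When hyperhold is enabled, bias reclaim fully towards anon pages;
 * otherwise fall back to the global vm_swappiness setting.
 */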
static int get_hyperhold_swappiness(void)
{
	return is_hyperhold_enable() ? HYPERHOLD_SWAPPINESS : vm_swappiness;
}

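/*
 * Hyperhold-specific variant of get_scan_count(): fill nr[] with the
 * number of pages to scan from each evictable LRU at the current
 * reclaim priority, and *lru_pages with the total size of the LRUs
 * considered.
 */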
static void get_scan_count_hyperhold(struct pglist_data *pgdat,
		struct scan_control *sc, unsigned long *nr,
		unsigned long *lru_pages)
{
	int swappiness = get_hyperhold_swappiness();
	struct lruvec *lruvec = node_lruvec(pgdat);
	u64 fraction[2];
	u64 denominator;
	enum scan_balance scan_balance;
	unsigned long ap, fp;
	enum lru_list lru;
	unsigned long pgdatfile;
	unsigned long pgdatfree;
	int z;
	unsigned long anon_cost, file_cost, total_cost;
	unsigned long total_high_wmark = 0;

	if (cgroup_reclaim(sc) && !swappiness) {
		scan_balance = SCAN_FILE;
		goto out;
	}

	/*
	 * Do not apply any pressure balancing cleverness when the
	 * system is close to OOM, scan both anon and file equally
	 * (unless the swappiness setting disagrees with swapping).
	 */
	if (!sc->priority && swappiness) {
		scan_balance = SCAN_EQUAL;
		goto out;
	}

	if (!cgroup_reclaim(sc)) {
		pgdatfree = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
		pgdatfile = node_page_state(pgdat, NR_ACTIVE_FILE) +
			node_page_state(pgdat, NR_INACTIVE_FILE);

		for (z = 0; z < MAX_NR_ZONES; z++) {
			struct zone *zone = &pgdat->node_zones[z];

			if (!managed_zone(zone))
				continue;

			total_high_wmark += high_wmark_pages(zone);
		}

		if (unlikely(pgdatfile + pgdatfree <= total_high_wmark)) {
			/*
			 * Force SCAN_ANON if there are enough inactive
			 * anonymous pages on the LRU in eligible zones.
			 * Otherwise, the small LRU gets thrashed.
			 */
			if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON) &&
			    (lruvec_lru_size(lruvec, LRU_INACTIVE_ANON,
					     sc->reclaim_idx) >>
					     (unsigned int)sc->priority)) {
				scan_balance = SCAN_ANON;
				goto out;
			}
		}
	}

	/*
	 * If there is enough inactive page cache, i.e. if the size of the
	 * inactive list is greater than that of the active list *and* the
	 * inactive list actually has some pages to scan on this priority, we
	 * do not reclaim anything from the anonymous working set right now.
	 * Without the second condition we could end up never scanning an
	 * lruvec even if it has plenty of old anonymous pages unless the
	 * system is under heavy pressure.
	 */
	if (!IS_ENABLED(CONFIG_BALANCE_ANON_FILE_RECLAIM) &&
	    !inactive_is_low(lruvec, LRU_INACTIVE_FILE) &&
	    lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) {
		scan_balance = SCAN_FILE;
		goto out;
	}

	scan_balance = SCAN_FRACT;

	/*
	 * Calculate the pressure balance between anon and file pages.
	 *
	 * The amount of pressure we put on each LRU is inversely
	 * proportional to the cost of reclaiming each list, as
	 * determined by the share of pages that are refaulting, times
	 * the relative IO cost of bringing back a swapped out
	 * anonymous page vs reloading a filesystem page (swappiness).
	 *
	 * Although we limit that influence to ensure no list gets
	 * left behind completely: at least a third of the pressure is
	 * applied, before swappiness.
	 *
	 * With swappiness at 100, anon and file have equal IO cost.
	 */
	total_cost = sc->anon_cost + sc->file_cost;
	anon_cost = total_cost + sc->anon_cost;
	file_cost = total_cost + sc->file_cost;
	total_cost = anon_cost + file_cost;

	ap = swappiness * (total_cost + 1);
	ap /= anon_cost + 1;

	fp = (200 - swappiness) * (total_cost + 1);
	fp /= file_cost + 1;

	fraction[0] = ap;
	fraction[1] = fp;
	denominator = ap + fp;
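	/*
	 * Because each effective cost above is total_cost plus that list's
	 * own recent cost, it always falls between 1x and 2x total_cost, so
	 * before swappiness no list gets less than a third of the pressure.
	 * With swappiness == 100 and equal recent costs, ap == fp and both
	 * lists are scanned in proportion to their size.
	 */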

out:
	*lru_pages = 0;
	for_each_evictable_lru(lru) {
		int file = is_file_lru(lru);
		unsigned long lruvec_size;
		unsigned long scan;

		lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
		scan = lruvec_size;
		*lru_pages += scan;
		scan >>= sc->priority;

		switch (scan_balance) {
		case SCAN_EQUAL:
			/* Scan lists relative to size */
			break;
		case SCAN_FRACT:
			/*
			 * Scan types proportional to swappiness and
			 * their relative recent reclaim efficiency.
			 * Make sure we don't miss the last page on
			 * the offlined memory cgroups because of a
			 * round-off error.
			 */
			scan = DIV64_U64_ROUND_UP(scan * fraction[file],
						  denominator);
			break;
		case SCAN_FILE:
		case SCAN_ANON:
			/* Scan one type exclusively */
			if ((scan_balance == SCAN_FILE) != file)
				scan = 0;
			break;
		default:
			/* Look ma, no brain */
			BUG();
		}

		nr[lru] = scan;
	}
}

#define ISOLATE_LIMIT_CNT 5
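/*
 * Reclaim anon pages from one memcg's LRU lists in SWAP_CLUSTER_MAX
 * batches until the scan targets in nr[] are exhausted, the reclaim
 * target is met, or (for direct reclaim) sc->isolate_count exceeds
 * ISOLATE_LIMIT_CNT.
 */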
void shrink_anon_memcg(struct pglist_data *pgdat,
		struct mem_cgroup *memcg, struct scan_control *sc,
		unsigned long *nr)
{
	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
	unsigned long nr_to_scan;
	enum lru_list lru;
	unsigned long nr_reclaimed = 0;
	struct blk_plug plug;

	blk_start_plug(&plug);

	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_ANON]) {
		for (lru = 0; lru <= LRU_ACTIVE_ANON; lru++) {
			if (nr[lru]) {
				nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
				nr[lru] -= nr_to_scan;
				nr_reclaimed +=
					shrink_list(lru, nr_to_scan,
						    lruvec, sc);
			}
		}
		if (sc->nr_reclaimed >= sc->nr_to_reclaim ||
				(sc->isolate_count > ISOLATE_LIMIT_CNT &&
				 sc->invoker == DIRECT_RECLAIM))
			break;
	}
	blk_finish_plug(&plug);
	sc->nr_reclaimed += nr_reclaimed;
	sc->nr_reclaimed_anon += nr_reclaimed;
}

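/*
 * Return true if @mcg is @tmcg itself or one of its descendants.
 * A NULL @tmcg matches every memcg.
 */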
static inline bool memcg_is_child_of(struct mem_cgroup *mcg, struct mem_cgroup *tmcg)
{
	if (tmcg == NULL)
		return true;

	while (!mem_cgroup_is_root(mcg)) {
		if (mcg == tmcg)
			break;

		mcg = parent_mem_cgroup(mcg);
	}

	return (mcg == tmcg);
}

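/*
 * Walk the memcg score list via get_next_memcg() and shrink the anon
 * LRUs of every memcg that is a child of the reclaim target. The
 * node-wide anon scan targets in nr[] are divided among the memcgs in
 * proportion to their own active/inactive anon list sizes, and
 * memory.min/memory.low protection is honoured before any reclaim.
 */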
static void shrink_anon(struct pglist_data *pgdat,
		struct scan_control *sc, unsigned long *nr)
{
	unsigned long reclaimed;
	unsigned long scanned;
	struct mem_cgroup *memcg = NULL;
	struct mem_cgroup *target_memcg = sc->target_mem_cgroup;
	unsigned long nr_memcg[NR_LRU_LISTS];
	unsigned long nr_node_active = lruvec_lru_size(
			node_lruvec(pgdat), LRU_ACTIVE_ANON, MAX_NR_ZONES);
	unsigned long nr_node_inactive = lruvec_lru_size(
			node_lruvec(pgdat), LRU_INACTIVE_ANON, MAX_NR_ZONES);

	while ((memcg = get_next_memcg(memcg))) {
		struct lruvec *lruvec = NULL;

		if (!memcg_is_child_of(memcg, target_memcg))
			continue;

		lruvec = mem_cgroup_lruvec(memcg, pgdat);

		reclaimed = sc->nr_reclaimed;
		scanned = sc->nr_scanned;

		nr_memcg[LRU_ACTIVE_ANON] = nr[LRU_ACTIVE_ANON] *
			lruvec_lru_size(lruvec, LRU_ACTIVE_ANON,
					MAX_NR_ZONES) / (nr_node_active + 1);
		nr_memcg[LRU_INACTIVE_ANON] = nr[LRU_INACTIVE_ANON] *
			lruvec_lru_size(lruvec, LRU_INACTIVE_ANON,
					MAX_NR_ZONES) / (nr_node_inactive + 1);
		nr_memcg[LRU_ACTIVE_FILE] = 0;
		nr_memcg[LRU_INACTIVE_FILE] = 0;

		/*
		 * This loop can become CPU-bound when target memcgs
		 * aren't eligible for reclaim - either because they
		 * don't have any reclaimable pages, or because their
		 * memory is explicitly protected. Avoid soft lockups.
		 */
		cond_resched();

		mem_cgroup_calculate_protection(target_memcg, memcg);

		if (mem_cgroup_below_min(target_memcg, memcg)) {
			/*
			 * Hard protection.
			 * If there is no reclaimable memory, OOM.
			 */
			continue;
		} else if (mem_cgroup_below_low(target_memcg, memcg)) {
			/*
			 * Soft protection.
			 * Respect the protection only as long as
			 * there is an unprotected supply
			 * of reclaimable memory from other cgroups.
			 */
			if (!sc->memcg_low_reclaim) {
				sc->memcg_low_skipped = 1;
				continue;
			}
			memcg_memory_event(memcg, MEMCG_LOW);
		}

		shrink_anon_memcg(pgdat, memcg, sc, nr_memcg);
		shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
			    sc->priority);

		vmpressure(sc->gfp_mask, memcg, false,
			   sc->nr_scanned - scanned,
			   sc->nr_reclaimed - reclaimed);

		if (sc->nr_reclaimed >= sc->nr_to_reclaim ||
				(sc->isolate_count > ISOLATE_LIMIT_CNT &&
				 sc->invoker == DIRECT_RECLAIM)) {
			get_next_memcg_break(memcg);
			break;
		}
	}
}

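/*
 * Shrink the node-wide ("Total") file LRU, reclaiming file pages in
 * SWAP_CLUSTER_MAX batches until the scan targets in nr[] are used up.
 */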
static void shrink_file(struct pglist_data *pgdat,
		struct scan_control *sc, unsigned long *nr)
{
	struct lruvec *lruvec = node_lruvec(pgdat);
	unsigned long nr_to_scan;
	enum lru_list lru;
	unsigned long nr_reclaimed = 0;
	struct blk_plug plug;

	blk_start_plug(&plug);

	while (nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) {
		for (lru = LRU_INACTIVE_FILE; lru <= LRU_ACTIVE_FILE; lru++) {
			if (nr[lru]) {
				nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
				nr[lru] -= nr_to_scan;
				nr_reclaimed += shrink_list(lru, nr_to_scan, lruvec, sc);
			}
		}
	}
	blk_finish_plug(&plug);
	sc->nr_reclaimed += nr_reclaimed;
	sc->nr_reclaimed_file += nr_reclaimed;
}

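/*
 * Node-level reclaim entry point for hyperhold, analogous to
 * shrink_node(): work out the anon/file scan balance, shrink the
 * node-wide file LRU and the per-memcg anon LRUs, then apply the usual
 * writeback/congestion throttling heuristics. Returns true if any
 * pages were reclaimed during this invocation.
 */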
bool shrink_node_hyperhold(struct pglist_data *pgdat, struct scan_control *sc)
{
	unsigned long nr_reclaimed;
	struct lruvec *target_lruvec;
	bool reclaimable = false;
	unsigned long file;

	target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
	do {
		/* Get scan count for file and anon */
		unsigned long node_lru_pages = 0;
		unsigned long nr[NR_LRU_LISTS] = {0};

		memset(&sc->nr, 0, sizeof(sc->nr));
		nr_reclaimed = sc->nr_reclaimed;

		/*
		 * Determine the scan balance between anon and file LRUs.
		 */
		spin_lock_irq(&target_lruvec->lru_lock);
		sc->anon_cost = mem_cgroup_lruvec(NULL, pgdat)->anon_cost;
		sc->file_cost = node_lruvec(pgdat)->file_cost;
		spin_unlock_irq(&target_lruvec->lru_lock);

		/*
		 * Target desirable inactive:active list ratios for the anon
		 * and file LRU lists.
		 */
		if (!sc->force_deactivate) {
			unsigned long refaults;

			refaults = lruvec_page_state(target_lruvec,
					WORKINGSET_ACTIVATE_ANON);
			if (refaults != target_lruvec->refaults[0] ||
				inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
				sc->may_deactivate |= DEACTIVATE_ANON;
			else
				sc->may_deactivate &= ~DEACTIVATE_ANON;

			/*
			 * When refaults are being observed, it means a new
			 * workingset is being established. Deactivate to get
			 * rid of any stale active pages quickly.
			 */
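			/*
			 * Note: the "else" below the #endif pairs with
			 * whichever of the two "if" statements is compiled
			 * in, so DEACTIVATE_FILE is cleared when no new file
			 * refaults are observed and the inactive file list
			 * is not low.
			 */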
#ifdef CONFIG_HYPERHOLD_FILE_LRU
			refaults = lruvec_page_state(node_lruvec(pgdat),
					WORKINGSET_ACTIVATE_FILE);
			if (refaults != node_lruvec(pgdat)->refaults[1] ||
				inactive_is_low(node_lruvec(pgdat), LRU_INACTIVE_FILE))
				sc->may_deactivate |= DEACTIVATE_FILE;
#else
			refaults = lruvec_page_state(target_lruvec,
					WORKINGSET_ACTIVATE_FILE);
			if (refaults != target_lruvec->refaults[1] ||
				inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
				sc->may_deactivate |= DEACTIVATE_FILE;
#endif
			else
				sc->may_deactivate &= ~DEACTIVATE_FILE;
		} else
			sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;

		/*
		 * If we have plenty of inactive file pages that aren't
		 * thrashing, try to reclaim those first before touching
		 * anonymous pages.
		 */
#ifdef CONFIG_HYPERHOLD_FILE_LRU
		file = lruvec_page_state(node_lruvec(pgdat), NR_INACTIVE_FILE);
#else
		file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
#endif
		if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
			sc->cache_trim_mode = 1;
		else
			sc->cache_trim_mode = 0;

		/*
		 * Prevent the reclaimer from falling into the cache trap: as
		 * cache pages start out inactive, every cache fault will tip
		 * the scan balance towards the file LRU. And as the file LRU
		 * shrinks, so does the window for rotation from references.
		 * This means we have a runaway feedback loop where a tiny
		 * thrashing file LRU becomes infinitely more attractive than
		 * anon pages. Try to detect this based on file LRU size.
		 */
		if (!cgroup_reclaim(sc)) {
			unsigned long total_high_wmark = 0;
			unsigned long free, anon;
			int z;

			free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
			file = node_page_state(pgdat, NR_ACTIVE_FILE) +
				node_page_state(pgdat, NR_INACTIVE_FILE);

			for (z = 0; z < MAX_NR_ZONES; z++) {
				struct zone *zone = &pgdat->node_zones[z];

				if (!managed_zone(zone))
					continue;

				total_high_wmark += high_wmark_pages(zone);
			}

			/*
			 * Consider anon: if that's low too, this isn't a
			 * runaway file reclaim problem, but rather just
			 * extreme pressure. Reclaim as per usual then.
			 */
			anon = node_page_state(pgdat, NR_INACTIVE_ANON);

			sc->file_is_tiny =
				file + free <= total_high_wmark &&
				!(sc->may_deactivate & DEACTIVATE_ANON) &&
				anon >> sc->priority;
		}

		get_scan_count_hyperhold(pgdat, sc, nr, &node_lru_pages);

		if (!cgroup_reclaim(sc)) {
			/* Shrink the Total-File-LRU */
			shrink_file(pgdat, sc, nr);
		}

		/* Shrink Anon by iterating score_list */
		shrink_anon(pgdat, sc, nr);

		if (sc->nr_reclaimed - nr_reclaimed)
			reclaimable = true;

		if (current_is_kswapd()) {
			/*
			 * If reclaim is isolating dirty pages under writeback,
			 * it implies that the long-lived page allocation rate
			 * is exceeding the page laundering rate. Either the
			 * global limits are not being effective at throttling
			 * processes due to the page distribution throughout
			 * zones or there is heavy usage of a slow backing
			 * device. The only option is to throttle from reclaim
			 * context which is not ideal as there is no guarantee
			 * the dirtying process is throttled in the same way
			 * balance_dirty_pages() manages.
			 *
			 * Once a node is flagged PGDAT_WRITEBACK, kswapd will
			 * count the number of pages under pages flagged for
			 * immediate reclaim and stall if any are encountered
			 * in the nr_immediate check below.
			 */
			if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
				set_bit(PGDAT_WRITEBACK, &pgdat->flags);

			/* Allow kswapd to start writing pages during reclaim. */
			if (sc->nr.unqueued_dirty == sc->nr.file_taken)
				set_bit(PGDAT_DIRTY, &pgdat->flags);

			/*
			 * If kswapd scans pages marked for immediate
			 * reclaim and under writeback (nr_immediate), it
			 * implies that pages are cycling through the LRU
			 * faster than they are written so also forcibly stall.
			 */
			if (sc->nr.immediate)
				reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
		}

		/*
		 * Legacy memcg will stall in page writeback so avoid forcibly
		 * stalling in reclaim_throttle().
		 */
		if ((current_is_kswapd() ||
		     (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) &&
		    sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
			set_bit(LRUVEC_NODE_CONGESTED, &target_lruvec->flags);

		/*
		 * Stall direct reclaim for IO completions if the target
		 * lruvec is marked congested. Allow kswapd to continue until
		 * it starts encountering unqueued dirty pages or cycling
		 * through the LRU too quickly.
		 */
		if (!current_is_kswapd() && current_may_throttle() &&
		    !sc->hibernation_mode &&
		    test_bit(LRUVEC_NODE_CONGESTED, &target_lruvec->flags))
			reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);

	} while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
					 sc));

	/*
	 * Kswapd gives up on balancing particular nodes after too
	 * many failures to reclaim anything from them and goes to
	 * sleep. On reclaim progress, reset the failure counter. A
	 * successful direct reclaim run will revive a dormant kswapd.
	 */
	if (reclaimable)
		pgdat->kswapd_failures = 0;

	return reclaimable;
}