// SPDX-License-Identifier: GPL-2.0
/*
 * mm/memcg_reclaim.c
 *
 * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd.
 */
#include <linux/mm.h>
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/hyperhold_inf.h>
#include <linux/memcontrol.h>

#ifdef CONFIG_HYPERHOLD_FILE_LRU
#include <linux/memcg_policy.h>
#include "internal.h"
#endif

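/*
 * Swapping anon pages is off the table when the caller forbids it,
 * swappiness is zero, or there is no swap space left.
 */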
static inline bool is_swap_not_allowed(struct scan_control *sc, int swappiness)
{
	return !sc->may_swap || !swappiness || !get_nr_swap_pages();
}

/*
 * Swappiness ranges from 0 to 100; higher means more swappy. When
 * hyperhold is enabled, force the maximum so that anon and file pages
 * are treated as equally costly to reclaim.
 */
#define HYPERHOLD_SWAPPINESS 100

static int get_hyperhold_swappiness(void)
{
	return is_hyperhold_enable() ? HYPERHOLD_SWAPPINESS : vm_swappiness;
}

static void get_scan_count_hyperhold(struct pglist_data *pgdat,
		struct scan_control *sc, unsigned long *nr,
		unsigned long *lru_pages)
{
	int swappiness = get_hyperhold_swappiness();
	struct lruvec *lruvec = node_lruvec(pgdat);
	u64 fraction[2];
	u64 denominator;
	enum scan_balance scan_balance;
	unsigned long ap, fp;
	enum lru_list lru;
	unsigned long pgdatfile;
	unsigned long pgdatfree;
	int z;
	unsigned long anon_cost, file_cost, total_cost;
	unsigned long total_high_wmark = 0;

	if (cgroup_reclaim(sc) && !swappiness) {
		scan_balance = SCAN_FILE;
		goto out;
	}

	/*
	 * Do not apply any pressure balancing cleverness when the
	 * system is close to OOM, scan both anon and file equally
	 * (unless the swappiness setting disagrees with swapping).
	 */
	if (!sc->priority && swappiness) {
		scan_balance = SCAN_EQUAL;
		goto out;
	}

	if (!cgroup_reclaim(sc)) {
		pgdatfree = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
		pgdatfile = node_page_state(pgdat, NR_ACTIVE_FILE) +
			node_page_state(pgdat, NR_INACTIVE_FILE);

		for (z = 0; z < MAX_NR_ZONES; z++) {
			struct zone *zone = &pgdat->node_zones[z];

			if (!managed_zone(zone))
				continue;

			total_high_wmark += high_wmark_pages(zone);
		}

		if (unlikely(pgdatfile + pgdatfree <= total_high_wmark)) {
			/*
			 * Force SCAN_ANON if there are enough inactive
			 * anonymous pages on the LRU in eligible zones.
			 * Otherwise, the small LRU gets thrashed.
			 */
			if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON) &&
				(lruvec_lru_size(lruvec, LRU_INACTIVE_ANON,
					sc->reclaim_idx) >>
					(unsigned int)sc->priority)) {
				scan_balance = SCAN_ANON;
				goto out;
			}
		}
	}

	/*
	 * If there is enough inactive page cache, i.e. if the size of the
	 * inactive list is greater than that of the active list *and* the
	 * inactive list actually has some pages to scan on this priority, we
	 * do not reclaim anything from the anonymous working set right now.
	 * Without the second condition we could end up never scanning an
	 * lruvec even if it has plenty of old anonymous pages unless the
	 * system is under heavy pressure.
	 */
	if (!IS_ENABLED(CONFIG_BALANCE_ANON_FILE_RECLAIM) &&
	    !inactive_is_low(lruvec, LRU_INACTIVE_FILE) &&
	    lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) {
		scan_balance = SCAN_FILE;
		goto out;
	}

	scan_balance = SCAN_FRACT;

	/*
	 * Calculate the pressure balance between anon and file pages.
	 *
	 * The amount of pressure we put on each LRU is inversely
	 * proportional to the cost of reclaiming each list, as
	 * determined by the share of pages that are refaulting, times
	 * the relative IO cost of bringing back a swapped out
	 * anonymous page vs reloading a filesystem page (swappiness).
	 *
	 * Although we limit that influence to ensure no list gets
	 * left behind completely: at least a third of the pressure is
	 * applied, before swappiness.
	 *
	 * With swappiness at 100, anon and file have equal IO cost.
	 */
	total_cost = sc->anon_cost + sc->file_cost;
	anon_cost = total_cost + sc->anon_cost;
	file_cost = total_cost + sc->file_cost;
	total_cost = anon_cost + file_cost;
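	/*
	 * Folding the combined cost into each side keeps anon_cost and
	 * file_cost within a factor of two of each other, which is what
	 * guarantees the "at least a third of the pressure" floor noted
	 * above.
	 */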

	ap = swappiness * (total_cost + 1);
	ap /= anon_cost + 1;

	fp = (200 - swappiness) * (total_cost + 1);
	fp /= file_cost + 1;

	fraction[0] = ap;
	fraction[1] = fp;
	denominator = ap + fp;

out:
	*lru_pages = 0;
	for_each_evictable_lru(lru) {
		int file = is_file_lru(lru);
		unsigned long lruvec_size;
		unsigned long scan;

		lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
		scan = lruvec_size;
		*lru_pages += scan;
		scan >>= sc->priority;

		switch (scan_balance) {
		case SCAN_EQUAL:
			/* Scan lists relative to size */
			break;
		case SCAN_FRACT:
			/*
			 * Scan types proportional to swappiness and
			 * their relative recent reclaim efficiency.
			 * Make sure we don't miss the last page on
			 * the offlined memory cgroups because of a
			 * round-off error.
			 */
			scan = DIV64_U64_ROUND_UP(scan * fraction[file],
						  denominator);
			break;
		case SCAN_FILE:
		case SCAN_ANON:
			/* Scan one type exclusively */
			if ((scan_balance == SCAN_FILE) != file)
				scan = 0;
			break;
		default:
			/* Look ma, no brain */
			BUG();
		}

		nr[lru] = scan;
	}
}

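/*
 * Bail out of a direct-reclaim pass once sc->isolate_count exceeds this
 * threshold; see the break conditions in shrink_anon_memcg() and
 * shrink_anon().
 */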
#define ISOLATE_LIMIT_CNT 5
void shrink_anon_memcg(struct pglist_data *pgdat,
		struct mem_cgroup *memcg, struct scan_control *sc,
		unsigned long *nr)
{
	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
	unsigned long nr_to_scan;
	enum lru_list lru;
	unsigned long nr_reclaimed = 0;
	struct blk_plug plug;

	blk_start_plug(&plug);

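	/*
	 * Work through the inactive and active anon lists in
	 * SWAP_CLUSTER_MAX batches until the requested counts in @nr are
	 * consumed, the reclaim target is met, or direct reclaim has
	 * isolated too many pages.
	 */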
	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_ANON]) {
		for (lru = 0; lru <= LRU_ACTIVE_ANON; lru++) {
			if (nr[lru]) {
				nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
				nr[lru] -= nr_to_scan;
				nr_reclaimed +=
					shrink_list(lru, nr_to_scan,
							lruvec, sc);
			}
		}
		if (sc->nr_reclaimed >= sc->nr_to_reclaim ||
				(sc->isolate_count > ISOLATE_LIMIT_CNT &&
				sc->invoker == DIRECT_RECLAIM))
			break;
	}
	blk_finish_plug(&plug);
	sc->nr_reclaimed += nr_reclaimed;
	sc->nr_reclaimed_anon += nr_reclaimed;
}

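/*
 * Return true if @mcg is @tmcg itself or one of its descendants;
 * a NULL @tmcg matches every memcg.
 */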
static inline bool memcg_is_child_of(struct mem_cgroup *mcg, struct mem_cgroup *tmcg)
{
	if (tmcg == NULL)
		return true;

	while (!mem_cgroup_is_root(mcg)) {
		if (mcg == tmcg)
			break;

		mcg = parent_mem_cgroup(mcg);
	}

	return (mcg == tmcg);
}

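/*
 * Walk the memcg score list and shrink the anon LRUs of every memcg
 * under the reclaim target, splitting the node-wide scan budget in @nr
 * among them in proportion to their anon LRU sizes while honouring
 * memcg min/low protection.
 */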
static void shrink_anon(struct pglist_data *pgdat,
		struct scan_control *sc, unsigned long *nr)
{
	unsigned long reclaimed;
	unsigned long scanned;
	struct mem_cgroup *memcg = NULL;
	struct mem_cgroup *target_memcg = sc->target_mem_cgroup;
	unsigned long nr_memcg[NR_LRU_LISTS];
	unsigned long nr_node_active = lruvec_lru_size(
			node_lruvec(pgdat), LRU_ACTIVE_ANON, MAX_NR_ZONES);
	unsigned long nr_node_inactive = lruvec_lru_size(
			node_lruvec(pgdat), LRU_INACTIVE_ANON, MAX_NR_ZONES);

	while ((memcg = get_next_memcg(memcg))) {
		struct lruvec *lruvec = NULL;

		if (!memcg_is_child_of(memcg, target_memcg))
			continue;

		lruvec = mem_cgroup_lruvec(memcg, pgdat);

		reclaimed = sc->nr_reclaimed;
		scanned = sc->nr_scanned;

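		/*
		 * Give this memcg a share of the node-wide scan target
		 * proportional to its share of the node's anon LRU pages;
		 * the +1 keeps the divisor from being zero.
		 */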
		nr_memcg[LRU_ACTIVE_ANON] = nr[LRU_ACTIVE_ANON] *
			lruvec_lru_size(lruvec, LRU_ACTIVE_ANON,
					MAX_NR_ZONES) / (nr_node_active + 1);
		nr_memcg[LRU_INACTIVE_ANON] = nr[LRU_INACTIVE_ANON] *
			lruvec_lru_size(lruvec, LRU_INACTIVE_ANON,
					MAX_NR_ZONES) / (nr_node_inactive + 1);
		nr_memcg[LRU_ACTIVE_FILE] = 0;
		nr_memcg[LRU_INACTIVE_FILE] = 0;

		/*
		 * This loop can become CPU-bound when target memcgs
		 * aren't eligible for reclaim - either because they
		 * don't have any reclaimable pages, or because their
		 * memory is explicitly protected. Avoid soft lockups.
		 */
		cond_resched();

		mem_cgroup_calculate_protection(target_memcg, memcg);

		if (mem_cgroup_below_min(target_memcg, memcg)) {
			/*
			 * Hard protection.
			 * If there is no reclaimable memory, OOM.
			 */
			continue;
		} else if (mem_cgroup_below_low(target_memcg, memcg)) {
			/*
			 * Soft protection.
			 * Respect the protection only as long as
			 * there is an unprotected supply
			 * of reclaimable memory from other cgroups.
			 */
			if (!sc->memcg_low_reclaim) {
				sc->memcg_low_skipped = 1;
				continue;
			}
			memcg_memory_event(memcg, MEMCG_LOW);
		}

		shrink_anon_memcg(pgdat, memcg, sc, nr_memcg);
		shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
					sc->priority);

		vmpressure(sc->gfp_mask, memcg, false,
				sc->nr_scanned - scanned,
				sc->nr_reclaimed - reclaimed);

		if (sc->nr_reclaimed >= sc->nr_to_reclaim ||
			(sc->isolate_count > ISOLATE_LIMIT_CNT &&
			sc->invoker == DIRECT_RECLAIM)) {
			get_next_memcg_break(memcg);
			break;
		}
	}
}

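/*
 * Shrink the node-wide file LRU (file pages are kept on the node lruvec
 * rather than on per-memcg lruvecs here) in SWAP_CLUSTER_MAX batches
 * until the requested file scan counts are consumed.
 */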
static void shrink_file(struct pglist_data *pgdat,
		struct scan_control *sc, unsigned long *nr)
{
	struct lruvec *lruvec = node_lruvec(pgdat);
	unsigned long nr_to_scan;
	enum lru_list lru;
	unsigned long nr_reclaimed = 0;
	struct blk_plug plug;

	blk_start_plug(&plug);

	while (nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) {
		for (lru = LRU_INACTIVE_FILE; lru <= LRU_ACTIVE_FILE; lru++) {
			if (nr[lru]) {
				nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
				nr[lru] -= nr_to_scan;
				nr_reclaimed += shrink_list(lru, nr_to_scan, lruvec, sc);
			}
		}
	}
	blk_finish_plug(&plug);
	sc->nr_reclaimed += nr_reclaimed;
	sc->nr_reclaimed_file += nr_reclaimed;
}

bool shrink_node_hyperhold(struct pglist_data *pgdat, struct scan_control *sc)
{
	unsigned long nr_reclaimed;
	struct lruvec *target_lruvec;
	bool reclaimable = false;
	unsigned long file;

	target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
	do {
		/* Get scan count for file and anon */
		unsigned long node_lru_pages = 0;
		unsigned long nr[NR_LRU_LISTS] = {0};

		memset(&sc->nr, 0, sizeof(sc->nr));
		nr_reclaimed = sc->nr_reclaimed;

		/*
		 * Determine the scan balance between anon and file LRUs.
		 */
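		/*
		 * The anon cost lives on the root memcg's lruvec, the file
		 * cost on the node-level lruvec where file pages are kept
		 * in this configuration.
		 */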
		spin_lock_irq(&target_lruvec->lru_lock);
		sc->anon_cost = mem_cgroup_lruvec(NULL, pgdat)->anon_cost;
		sc->file_cost = node_lruvec(pgdat)->file_cost;
		spin_unlock_irq(&target_lruvec->lru_lock);

		/*
		 * Target desirable inactive:active list ratios for the anon
		 * and file LRU lists.
		 */
		if (!sc->force_deactivate) {
			unsigned long refaults;

			refaults = lruvec_page_state(target_lruvec,
					WORKINGSET_ACTIVATE_ANON);
			if (refaults != target_lruvec->refaults[0] ||
					inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
				sc->may_deactivate |= DEACTIVATE_ANON;
			else
				sc->may_deactivate &= ~DEACTIVATE_ANON;

			/*
			 * When refaults are being observed, it means a new
			 * workingset is being established. Deactivate to get
			 * rid of any stale active pages quickly.
			 */
#ifdef CONFIG_HYPERHOLD_FILE_LRU
			refaults = lruvec_page_state(node_lruvec(pgdat),
					WORKINGSET_ACTIVATE_FILE);
			if (refaults != node_lruvec(pgdat)->refaults[1] ||
					inactive_is_low(node_lruvec(pgdat), LRU_INACTIVE_FILE))
				sc->may_deactivate |= DEACTIVATE_FILE;
#else
			refaults = lruvec_page_state(target_lruvec,
					WORKINGSET_ACTIVATE_FILE);
			if (refaults != target_lruvec->refaults[1] ||
					inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
				sc->may_deactivate |= DEACTIVATE_FILE;
#endif
			else
				sc->may_deactivate &= ~DEACTIVATE_FILE;
		} else
			sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;

		/*
		 * If we have plenty of inactive file pages that aren't
		 * thrashing, try to reclaim those first before touching
		 * anonymous pages.
		 */
#ifdef CONFIG_HYPERHOLD_FILE_LRU
		file = lruvec_page_state(node_lruvec(pgdat), NR_INACTIVE_FILE);
#else
		file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
#endif
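		/*
		 * Trim the file cache first only when plenty of inactive
		 * file pages remain at this priority and they are not being
		 * deactivated due to refaults.
		 */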
		if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
			sc->cache_trim_mode = 1;
		else
			sc->cache_trim_mode = 0;

		/*
		 * Prevent the reclaimer from falling into the cache trap: as
		 * cache pages start out inactive, every cache fault will tip
		 * the scan balance towards the file LRU.  And as the file LRU
		 * shrinks, so does the window for rotation from references.
		 * This means we have a runaway feedback loop where a tiny
		 * thrashing file LRU becomes infinitely more attractive than
		 * anon pages.  Try to detect this based on file LRU size.
		 */
		if (!cgroup_reclaim(sc)) {
			unsigned long total_high_wmark = 0;
			unsigned long free, anon;
			int z;

			free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
			file = node_page_state(pgdat, NR_ACTIVE_FILE) +
				node_page_state(pgdat, NR_INACTIVE_FILE);

			for (z = 0; z < MAX_NR_ZONES; z++) {
				struct zone *zone = &pgdat->node_zones[z];

				if (!managed_zone(zone))
					continue;

				total_high_wmark += high_wmark_pages(zone);
			}

			/*
			 * Consider anon: if that's low too, this isn't a
			 * runaway file reclaim problem, but rather just
			 * extreme pressure. Reclaim as per usual then.
			 */
			anon = node_page_state(pgdat, NR_INACTIVE_ANON);

			sc->file_is_tiny =
				file + free <= total_high_wmark &&
				!(sc->may_deactivate & DEACTIVATE_ANON) &&
				anon >> sc->priority;
		}

		get_scan_count_hyperhold(pgdat, sc, nr, &node_lru_pages);

		if (!cgroup_reclaim(sc)) {
			/* Shrink the Total-File-LRU */
			shrink_file(pgdat, sc, nr);
		}

		/* Shrink Anon by iterating score_list */
		shrink_anon(pgdat, sc, nr);

		if (sc->nr_reclaimed - nr_reclaimed)
			reclaimable = true;

		if (current_is_kswapd()) {
			/*
			 * If reclaim is isolating dirty pages under writeback,
			 * it implies that the long-lived page allocation rate
			 * is exceeding the page laundering rate. Either the
			 * global limits are not being effective at throttling
			 * processes due to the page distribution throughout
			 * zones or there is heavy usage of a slow backing
			 * device. The only option is to throttle from reclaim
			 * context which is not ideal as there is no guarantee
			 * the dirtying process is throttled in the same way
			 * balance_dirty_pages() manages.
			 *
			 * Once a node is flagged PGDAT_WRITEBACK, kswapd will
			 * count the number of pages under writeback that are
			 * flagged for immediate reclaim and stall if any are
			 * encountered in the nr_immediate check below.
			 */
			if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
				set_bit(PGDAT_WRITEBACK, &pgdat->flags);

			/* Allow kswapd to start writing pages during reclaim. */
			if (sc->nr.unqueued_dirty == sc->nr.file_taken)
				set_bit(PGDAT_DIRTY, &pgdat->flags);

			/*
			 * If kswapd scans pages marked for immediate
			 * reclaim and under writeback (nr_immediate), it
			 * implies that pages are cycling through the LRU
			 * faster than they are written so also forcibly stall.
			 */
			if (sc->nr.immediate)
				reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
		}
		/*
		 * Legacy memcg will stall in page writeback so avoid forcibly
		 * stalling in reclaim_throttle().
		 */
		if ((current_is_kswapd() ||
		    (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) &&
		    sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
			set_bit(LRUVEC_NODE_CONGESTED, &target_lruvec->flags);

		/*
		 * Stall direct reclaim for IO completions if the underlying
		 * BDIs and the node are congested. Allow kswapd to continue
		 * until it starts encountering unqueued dirty pages or
		 * cycling through the LRU too quickly.
		 */
		if (!current_is_kswapd() && current_may_throttle() &&
		    !sc->hibernation_mode &&
		    test_bit(LRUVEC_NODE_CONGESTED, &target_lruvec->flags))
			reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);

	} while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
					 sc));
	/*
	 * Kswapd gives up on balancing particular nodes after too
	 * many failures to reclaim anything from them and goes to
	 * sleep. On reclaim progress, reset the failure counter. A
	 * successful direct reclaim run will revive a dormant kswapd.
	 */
	if (reclaimable)
		pgdat->kswapd_failures = 0;

	return reclaimable;
}