// SPDX-License-Identifier: GPL-2.0
/*
 * mm/zswapd.c
 *
 * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd.
 */

#include <linux/blkdev.h>
#include <linux/freezer.h>
#include <linux/memcg_policy.h>
#include <trace/events/vmscan.h>
#include <uapi/linux/sched/types.h>
#include <linux/zswapd.h>
#ifdef CONFIG_RECLAIM_ACCT
#include <linux/reclaim_acct.h>
#endif

#include "zswapd_internal.h"
#include "internal.h"

#define UNSET_ZRAM_WM_RATIO 0
#define ESWAP_PERCENT_CONSTANT 100
#define DEFAULT_ZRAM_WM_RATIO 37
#define SWAP_MORE_ZRAM (50 * (SZ_1M))

static wait_queue_head_t snapshotd_wait;
static atomic_t snapshotd_wait_flag;
static atomic_t snapshotd_init_flag = ATOMIC_INIT(0);
static struct task_struct *snapshotd_task;

static pid_t zswapd_pid = -1;
static unsigned long long last_anon_pagefault;
static unsigned long long anon_refault_ratio;
static unsigned long long zswapd_skip_interval;
static unsigned long last_zswapd_time;
static unsigned long last_snapshot_time;
bool last_round_is_empty;

DECLARE_RWSEM(gs_lock);
LIST_HEAD(gs_list);

void unregister_group_swap(struct group_swap_device *gsdev)
{
	down_write(&gs_lock);
	list_del(&gsdev->list);
	up_write(&gs_lock);

	kfree(gsdev);
}
EXPORT_SYMBOL(unregister_group_swap);

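/*
 * Illustrative sketch only (not taken from this file): a group-swap
 * backend would typically fill in a struct group_swap_ops with its
 * group_read/group_write/group_data_size callbacks and register it:
 *
 *	static struct group_swap_ops my_gs_ops = {
 *		.group_read		= my_group_read,
 *		.group_write		= my_group_write,
 *		.group_data_size	= my_group_data_size,
 *	};
 *
 *	gsdev = register_group_swap(&my_gs_ops, my_priv);
 *	...
 *	unregister_group_swap(gsdev);
 *
 * The "my_*" names are placeholders; the real backend (e.g. the
 * hyperhold/eswap driver) defines its own ops and private data.
 */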
struct group_swap_device *register_group_swap(struct group_swap_ops *ops, void *priv)
{
	struct group_swap_device *gsdev = kzalloc(sizeof(struct group_swap_device), GFP_KERNEL);

	if (!gsdev)
		return NULL;

	gsdev->priv = priv;
	gsdev->ops = ops;

	down_write(&gs_lock);
	list_add(&gsdev->list, &gs_list);
	up_write(&gs_lock);

	return gsdev;
}
EXPORT_SYMBOL(register_group_swap);

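/*
 * Sum the per-memcg statistic of the requested type (CACHE_SIZE,
 * SWAP_SIZE, CACHE_PAGE, SWAP_PAGE or CACHE_FAULT) over all registered
 * group-swap devices.
 */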
u64 memcg_data_size(struct mem_cgroup *memcg, int type)
{
	struct group_swap_device *gsdev = NULL;
	u64 size = 0;

	down_read(&gs_lock);
	list_for_each_entry(gsdev, &gs_list, list)
		size += gsdev->ops->group_data_size(memcg->id.id, type, gsdev->priv);
	up_read(&gs_lock);

	return size;
}

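/*
 * Read back up to req_size of this memcg's swapped-out data from eswap
 * into zram, capped at ub_ufs2zram_ratio percent of the memcg's swap
 * size. Returns the amount actually read.
 */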
u64 swapin_memcg(struct mem_cgroup *memcg, u64 req_size)
{
	u64 swap_size = memcg_data_size(memcg, SWAP_SIZE);
	u64 read_size = 0;
	u64 ratio = atomic64_read(&memcg->memcg_reclaimed.ub_ufs2zram_ratio);
	struct group_swap_device *gsdev = NULL;

	if (req_size > div_u64(swap_size * ratio, ESWAP_PERCENT_CONSTANT))
		req_size = div_u64(swap_size * ratio, ESWAP_PERCENT_CONSTANT);
	down_read(&gs_lock);
	list_for_each_entry(gsdev, &gs_list, list) {
		read_size += gsdev->ops->group_read(memcg->id.id, req_size - read_size,
							gsdev->priv);
		if (read_size >= req_size)
			break;
	}
	up_read(&gs_lock);

	return read_size;
}

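/*
 * Write this memcg's zram data out to eswap until the swapped-out share
 * reaches ub_zram2ufs_ratio percent of its total (zram + eswap) size,
 * writing at most req_size. Returns the amount actually written.
 */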
static u64 swapout_memcg(struct mem_cgroup *memcg, u64 req_size)
{
	u64 cache_size = memcg_data_size(memcg, CACHE_SIZE);
	u64 swap_size = memcg_data_size(memcg, SWAP_SIZE);
	u64 all_size = cache_size + swap_size;
	u64 write_size = 0;
	u32 ratio = atomic_read(&memcg->memcg_reclaimed.ub_zram2ufs_ratio);
	struct group_swap_device *gsdev = NULL;

	if (div_u64(all_size * ratio, ESWAP_PERCENT_CONSTANT) <= swap_size)
		return 0;
	if (req_size > div_u64(all_size * ratio, ESWAP_PERCENT_CONSTANT) - swap_size)
		req_size = div_u64(all_size * ratio, ESWAP_PERCENT_CONSTANT) - swap_size;
	down_read(&gs_lock);
	list_for_each_entry(gsdev, &gs_list, list) {
		write_size += gsdev->ops->group_write(memcg->id.id, req_size - write_size,
							gsdev->priv);
		if (write_size >= req_size)
			break;
	}
	up_read(&gs_lock);

	return write_size;
}

static u64 swapout(u64 req_size)
{
	struct mem_cgroup *memcg = NULL;
	u64 write_size = 0;

	while ((memcg = get_next_memcg(memcg)) != NULL) {
		write_size += swapout_memcg(memcg, req_size - write_size);
		if (write_size >= req_size)
			break;
	}

	return write_size;
}

static unsigned long long get_zram_used_pages(void)
{
	struct mem_cgroup *memcg = NULL;
	unsigned long long zram_pages = 0;

	while ((memcg = get_next_memcg(memcg)) != NULL)
		zram_pages += memcg_data_size(memcg, CACHE_PAGE);

	return zram_pages;
}

static unsigned long long get_eswap_used_pages(void)
{
	struct mem_cgroup *memcg = NULL;
	unsigned long long eswap_pages = 0;

	while ((memcg = get_next_memcg(memcg)) != NULL)
		eswap_pages += memcg_data_size(memcg, SWAP_PAGE);

	return eswap_pages;
}

static unsigned long long get_zram_pagefault(void)
{
	struct mem_cgroup *memcg = NULL;
	unsigned long long cache_fault = 0;

	while ((memcg = get_next_memcg(memcg)) != NULL)
		cache_fault += memcg_data_size(memcg, CACHE_FAULT);

	return cache_fault;
}

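/*
 * Current system "buffer" in MB: free memory plus the configured
 * fractions of inactive and active file pages, i.e.
 *
 *	buffers = free + inactive_file * inactive_file_ratio / 100 +
 *		  active_file * active_file_ratio / 100
 */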
static unsigned int calc_sys_cur_avail_buffers(void)
{
	const unsigned int percent_constant = 100;
	unsigned long freemem;
	unsigned long active_file;
	unsigned long inactive_file;
	unsigned long buffers;

	freemem = global_zone_page_state(NR_FREE_PAGES) * PAGE_SIZE / SZ_1K;
	active_file = global_node_page_state(NR_ACTIVE_FILE) * PAGE_SIZE / SZ_1K;
	inactive_file = global_node_page_state(NR_INACTIVE_FILE) * PAGE_SIZE / SZ_1K;

	buffers = freemem + inactive_file * get_inactive_file_ratio() / percent_constant +
		active_file * get_active_file_ratio() / percent_constant;

	return (buffers * SZ_1K / SZ_1M); /* kb to mb */
}

void zswapd_status_show(struct seq_file *m)
{
	unsigned int buffers = calc_sys_cur_avail_buffers();

	seq_printf(m, "buffer_size:%u\n", buffers);
	seq_printf(m, "recent_refault:%llu\n", anon_refault_ratio);
}

pid_t get_zswapd_pid(void)
{
	return zswapd_pid;
}

static bool min_buffer_is_suitable(void)
{
	unsigned int buffers = calc_sys_cur_avail_buffers();

	if (buffers >= get_min_avail_buffers())
		return true;

	return false;
}

static bool buffer_is_suitable(void)
{
	unsigned int buffers = calc_sys_cur_avail_buffers();

	if (buffers >= get_avail_buffers())
		return true;

	return false;
}

static bool high_buffer_is_suitable(void)
{
	unsigned int buffers = calc_sys_cur_avail_buffers();

	if (buffers >= get_high_avail_buffers())
		return true;

	return false;
}

static void snapshot_anon_refaults(void)
{
	struct mem_cgroup *memcg = NULL;

	while ((memcg = get_next_memcg(memcg)) != NULL)
		memcg->memcg_reclaimed.reclaimed_pagefault = memcg_data_size(memcg, CACHE_FAULT);

	last_anon_pagefault = get_zram_pagefault();
	last_snapshot_time = jiffies;
}

/*
 * Return true if the memcg's anon refaults since the last snapshot,
 * as a share of its anon footprint (LRU + zram + eswap), exceed its
 * refault_threshold.
 */
static bool get_memcg_anon_refault_status(struct mem_cgroup *memcg)
{
	const unsigned int percent_constant = 100;
	unsigned long long anon_pagefault;
	unsigned long long anon_total;
	unsigned long long ratio;
	struct mem_cgroup_per_node *mz = NULL;
	struct lruvec *lruvec = NULL;

	if (!memcg)
		return false;

	anon_pagefault = memcg_data_size(memcg, CACHE_FAULT);
	if (anon_pagefault == memcg->memcg_reclaimed.reclaimed_pagefault)
		return false;

	mz = mem_cgroup_nodeinfo(memcg, 0);
	if (!mz)
		return false;

	lruvec = &mz->lruvec;
	if (!lruvec)
		return false;

	anon_total = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) +
		lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES) +
		memcg_data_size(memcg, SWAP_PAGE) + memcg_data_size(memcg, CACHE_PAGE);

	ratio = div64_u64((anon_pagefault - memcg->memcg_reclaimed.reclaimed_pagefault) *
			percent_constant, (anon_total + 1));
	if (ratio > atomic_read(&memcg->memcg_reclaimed.refault_threshold))
		return true;

	return false;
}

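/*
 * System-wide check: compute the zram refault rate since the last
 * snapshot (refaults * 1000 / elapsed ms, i.e. roughly refaults per
 * second), record it in anon_refault_ratio, and compare it against
 * area_anon_refault_threshold.
 */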
static bool get_area_anon_refault_status(void)
{
	const unsigned int percent_constant = 1000;
	unsigned long long anon_pagefault;
	unsigned long long ratio;
	unsigned long long time;

	anon_pagefault = get_zram_pagefault();
	time = jiffies;
	if (anon_pagefault == last_anon_pagefault || time == last_snapshot_time)
		return false;

	ratio = div_u64((anon_pagefault - last_anon_pagefault) * percent_constant,
			(jiffies_to_msecs(time - last_snapshot_time) + 1));
	anon_refault_ratio = ratio;

	if (ratio > get_area_anon_refault_threshold())
		return true;

	return false;
}

void wakeup_snapshotd(void)
{
	unsigned long snapshot_interval;

	snapshot_interval = jiffies_to_msecs(jiffies - last_snapshot_time);
	if (snapshot_interval >= get_anon_refault_snapshot_min_interval()) {
		atomic_set(&snapshotd_wait_flag, 1);
		wake_up_interruptible(&snapshotd_wait);
	}
}

static int snapshotd(void *p)
{
	int ret;

	while (!kthread_should_stop()) {
		ret = wait_event_interruptible(snapshotd_wait, atomic_read(&snapshotd_wait_flag));
		if (ret)
			continue;

		atomic_set(&snapshotd_wait_flag, 0);

		snapshot_anon_refaults();
		count_vm_event(ZSWAPD_SNAPSHOT_TIMES);
	}

	return 0;
}

void set_snapshotd_init_flag(unsigned int val)
{
	atomic_set(&snapshotd_init_flag, val);
}

/*
 * This snapshotd start function will be called by init.
 */
int snapshotd_run(void)
{
	atomic_set(&snapshotd_wait_flag, 0);
	init_waitqueue_head(&snapshotd_wait);

	snapshotd_task = kthread_run(snapshotd, NULL, "snapshotd");
	if (IS_ERR(snapshotd_task)) {
		pr_err("Failed to start snapshotd\n");
		return PTR_ERR(snapshotd_task);
	}

	return 0;
}

static int __init snapshotd_init(void)
{
	snapshotd_run();

	return 0;
}
module_init(snapshotd_init);

static int get_zswapd_eswap_policy(void)
{
	if (get_zram_wm_ratio() == UNSET_ZRAM_WM_RATIO)
		return CHECK_BUFFER_ONLY;
	else
		return CHECK_BUFFER_ZRAMRATIO_BOTH;
}

static unsigned int get_policy_zram_wm_ratio(void)
{
	enum zswapd_eswap_policy policy = get_zswapd_eswap_policy();

	if (policy == CHECK_BUFFER_ONLY)
		return DEFAULT_ZRAM_WM_RATIO;
	else
		return get_zram_wm_ratio();
}

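/*
 * Derive the zram watermark actually used: start from the configured
 * (or default) zram_wm_ratio and lower it by the current buffer deficit
 * (target minus current buffer, converted to pages and scaled by the
 * compression ratio) expressed as a share of total RAM. A buffer
 * surplus never raises the watermark above zram_wm_ratio.
 */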
int get_zram_current_watermark(void)
{
	long long diff_buffers;
	const unsigned int percent_constant = 10;
	u64 nr_total;
	unsigned int zram_wm_ratio = get_policy_zram_wm_ratio();

	nr_total = totalram_pages();
	/* B_target - B_current */
	diff_buffers = get_avail_buffers() - calc_sys_cur_avail_buffers();
	/* MB to page */
	diff_buffers *= SZ_1M / PAGE_SIZE;
	/* after_comp to before_comp */
	diff_buffers *= get_compress_ratio();
	/* page to ratio */
	diff_buffers = div64_s64(diff_buffers * percent_constant, nr_total);

	return min((long long)zram_wm_ratio, zram_wm_ratio - diff_buffers);
}

bool zram_watermark_ok(void)
{
	const unsigned int percent_constant = 100;
	u64 nr_zram_used;
	u64 nr_wm;
	u64 ratio;

	ratio = get_zram_current_watermark();
	nr_zram_used = get_zram_used_pages();
	nr_wm = div_u64(totalram_pages() * ratio, percent_constant);
	if (nr_zram_used > nr_wm)
		return true;

	return false;
}

bool zram_watermark_exceed(void)
{
	u64 nr_zram_used;
	const unsigned long long nr_wm = get_zram_critical_threshold() * (SZ_1M / PAGE_SIZE);

	if (!nr_wm)
		return false;

	nr_zram_used = get_zram_used_pages();
	if (nr_zram_used > nr_wm)
		return true;
	return false;
}

void wakeup_zswapd(pg_data_t *pgdat)
{
	unsigned long interval;

	if (IS_ERR(pgdat->zswapd))
		return;

	if (!wq_has_sleeper(&pgdat->zswapd_wait))
		return;

	/*
	 * make anon pagefault snapshots
	 * wake up snapshotd
	 */
	if (atomic_read(&snapshotd_init_flag) == 1)
		wakeup_snapshotd();

	/* only wake zswapd when the buffer is lower than min_avail_buffers */
	if (min_buffer_is_suitable())
		return;

	interval = jiffies_to_msecs(jiffies - last_zswapd_time);
	if (interval < zswapd_skip_interval) {
		count_vm_event(ZSWAPD_EMPTY_ROUND_SKIP_TIMES);
		return;
	}

	atomic_set(&pgdat->zswapd_wait_flag, 1);
	wake_up_interruptible(&pgdat->zswapd_wait);
}

void wake_all_zswapd(void)
{
	pg_data_t *pgdat = NULL;
	int nid;

	for_each_online_node(nid) {
		pgdat = NODE_DATA(nid);
		wakeup_zswapd(pgdat);
	}
}

#ifdef CONFIG_HYPERHOLD_FILE_LRU
static void zswapd_shrink_active_list(unsigned long nr_to_scan,
	struct lruvec *lruvec, struct scan_control *sc, enum lru_list lru)
{
	unsigned int nr_deactivate;
	unsigned long nr_scanned;
	unsigned long nr_taken;

	struct page *page = NULL;
	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
	unsigned long *node_anon_cost = &pgdat->__lruvec.anon_cost;
	unsigned long *anon_cost = &lruvec->anon_cost;
	LIST_HEAD(l_inactive);
	LIST_HEAD(l_hold);

	lru_add_drain();

	spin_lock_irq(&lruvec->lru_lock);
	nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &l_hold, &nr_scanned, sc, lru);
	__mod_node_page_state(pgdat, NR_ISOLATED_ANON, nr_taken);
	*anon_cost += nr_taken;
	*node_anon_cost += nr_taken;
	__count_vm_events(PGREFILL, nr_scanned);
	count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
	spin_unlock_irq(&lruvec->lru_lock);

	while (!list_empty(&l_hold)) {
		cond_resched();
		page = lru_to_page(&l_hold);
		list_del(&page->lru);

		if (unlikely(!folio_evictable(page_folio(page)))) {
			putback_lru_page(page);
			continue;
		}

		ClearPageActive(page);
		SetPageWorkingset(page);
		list_add(&page->lru, &l_inactive);
	}

	spin_lock_irq(&lruvec->lru_lock);
	nr_deactivate = move_folios_to_lru(lruvec, &l_inactive);
	__mod_node_page_state(pgdat, NR_ISOLATED_ANON, -nr_taken);
	spin_unlock_irq(&lruvec->lru_lock);

	mem_cgroup_uncharge_list(&l_inactive);
	free_unref_page_list(&l_inactive);

	trace_mm_vmscan_lru_zswapd_shrink_active(pgdat->node_id, nr_taken,
			nr_deactivate, sc->priority);
}

static unsigned long zswapd_shrink_list(enum lru_list lru,
		unsigned long nr_to_scan, struct lruvec *lruvec,
		struct scan_control *sc)
{
#ifdef CONFIG_RECLAIM_ACCT
	unsigned long nr_reclaimed;

	reclaimacct_substage_start(RA_SHRINKANON);
#endif
	if (is_active_lru(lru)) {
		if (sc->may_deactivate & (1 << is_file_lru(lru)))
			zswapd_shrink_active_list(nr_to_scan, lruvec, sc, lru);
		else
			sc->skipped_deactivate = 1;
#ifdef CONFIG_RECLAIM_ACCT
		reclaimacct_substage_end(RA_SHRINKANON, 0, NULL);
#endif
		return 0;
	}

#ifdef CONFIG_RECLAIM_ACCT
	nr_reclaimed = shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
	reclaimacct_substage_end(RA_SHRINKANON, nr_reclaimed, NULL);
	return nr_reclaimed;
#else
	return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
#endif
}

static void zswapd_shrink_anon_memcg(struct pglist_data *pgdat,
	struct mem_cgroup *memcg, struct scan_control *sc, unsigned long *nr)
{
	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
	unsigned long nr_reclaimed = 0;
	unsigned long nr_to_scan;
	struct blk_plug plug;
	enum lru_list lru;

	blk_start_plug(&plug);

	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_ANON]) {
		for (lru = 0; lru <= LRU_ACTIVE_ANON; lru++) {
			if (nr[lru]) {
				nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
				nr[lru] -= nr_to_scan;
				nr_reclaimed += zswapd_shrink_list(lru,
							nr_to_scan, lruvec, sc);
			}
		}
	}

	blk_finish_plug(&plug);
	sc->nr_reclaimed += nr_reclaimed;
}
#endif

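/*
 * Walk all memcgs and shrink their anon LRUs until the high buffer
 * watermark is met or sc->nr_to_reclaim pages have been reclaimed.
 * memcgs that recently refaulted, or whose zram + eswap share already
 * exceeds ub_mem2zram_ratio, are skipped.
 */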
static bool zswapd_shrink_anon(pg_data_t *pgdat, struct scan_control *sc)
{
	const unsigned int percent_constant = 100;
	struct mem_cgroup *memcg = NULL;
	unsigned long nr[NR_LRU_LISTS];

	while ((memcg = get_next_memcg(memcg)) != NULL) {
		struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
		u64 nr_active, nr_inactive, nr_zram, nr_eswap, zram_ratio;

		/* reclaim and try to meet the high buffer watermark */
		if (high_buffer_is_suitable()) {
			get_next_memcg_break(memcg);
			break;
		}

		if (get_memcg_anon_refault_status(memcg)) {
			count_vm_event(ZSWAPD_MEMCG_REFAULT_SKIP);
			continue;
		}

		nr_active = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES);
		nr_inactive = lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES);
		nr_zram = memcg_data_size(memcg, CACHE_PAGE);
		nr_eswap = memcg_data_size(memcg, SWAP_PAGE);

		zram_ratio = div64_u64((nr_zram + nr_eswap) * percent_constant,
				(nr_inactive + nr_active + nr_zram + nr_eswap + 1));
		if (zram_ratio >= (u32)atomic_read(&memcg->memcg_reclaimed.ub_mem2zram_ratio)) {
			count_vm_event(ZSWAPD_MEMCG_RATIO_SKIP);
			continue;
		}

		nr[LRU_ACTIVE_ANON] = nr_active >> (unsigned int)sc->priority;
		nr[LRU_INACTIVE_ANON] = nr_inactive >> (unsigned int)sc->priority;
		nr[LRU_ACTIVE_FILE] = 0;
		nr[LRU_INACTIVE_FILE] = 0;

#ifdef CONFIG_HYPERHOLD_FILE_LRU
		zswapd_shrink_anon_memcg(pgdat, memcg, sc, nr);
#else
		shrink_lruvec(lruvec, sc);
#endif
		shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority);

		if (sc->nr_reclaimed >= sc->nr_to_reclaim) {
			get_next_memcg_break(memcg);
			break;
		}
	}

	return sc->nr_scanned >= sc->nr_to_reclaim;
}

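/*
 * Reclaim target in pages: the gap between the high buffer watermark
 * and the current buffer (in MB), capped at zswapd_max_reclaim_size,
 * then converted from MB to pages.
 */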
static u64 __calc_nr_to_reclaim(void)
{
	unsigned int buffers;
	unsigned int high_buffers;
	unsigned int max_reclaim_size;
	u64 reclaim_size = 0;

	high_buffers = get_high_avail_buffers();
	buffers = calc_sys_cur_avail_buffers();
	max_reclaim_size = get_zswapd_max_reclaim_size();
	if (buffers < high_buffers)
		reclaim_size = high_buffers - buffers;

	/* the reclaim target of a single round is capped at max_reclaim_size */
	reclaim_size = min(reclaim_size, (u64)max_reclaim_size);

	/* MB to pages */
	return div_u64(reclaim_size * SZ_1M, PAGE_SIZE);
}

static void zswapd_shrink_node(pg_data_t *pgdat)
{
	struct scan_control sc = {
		.gfp_mask = GFP_KERNEL,
		.order = 0,
		.priority = DEF_PRIORITY / 2,
		.may_writepage = !laptop_mode,
		.may_unmap = 1,
		.may_swap = 1,
		.reclaim_idx = MAX_NR_ZONES - 1,
	};
	const unsigned int increase_rate = 2;

	do {
		unsigned long nr_reclaimed = sc.nr_reclaimed;
		bool raise_priority = true;

		/* reclaim and try to meet the high buffer watermark */
		if (high_buffer_is_suitable())
			break;

		sc.nr_scanned = 0;
		sc.nr_to_reclaim = __calc_nr_to_reclaim();

		if (zswapd_shrink_anon(pgdat, &sc))
			raise_priority = false;
		count_vm_events(ZSWAPD_SCANNED, sc.nr_scanned);
		count_vm_events(ZSWAPD_RECLAIMED, sc.nr_reclaimed);
		if (try_to_freeze() || kthread_should_stop())
			break;

		nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
		if (raise_priority || !nr_reclaimed)
			sc.priority--;
	} while (sc.priority >= 1);

	/*
	 * When the first empty round is seen, set the skip interval to t.
	 * If the following round is still empty, double it to 2t; if rounds
	 * keep coming back empty, continue doubling (4t, 8t, and so on),
	 * but never exceed max_skip_interval. As soon as a non-empty round
	 * occurs, reset the interval to 0.
	 */
	if (sc.nr_reclaimed < get_empty_round_check_threshold()) {
		count_vm_event(ZSWAPD_EMPTY_ROUND);
		if (last_round_is_empty)
			zswapd_skip_interval = min(zswapd_skip_interval *
				increase_rate, get_max_skip_interval());
		else
			zswapd_skip_interval = get_empty_round_skip_interval();
		last_round_is_empty = true;
	} else {
		zswapd_skip_interval = 0;
		last_round_is_empty = false;
	}
}

u64 zram_watermark_diff(void)
{
	const unsigned int percent_constant = 100;
	u64 nr_zram_used;
	u64 nr_wm;
	u64 ratio;

	ratio = get_zram_current_watermark();
	nr_zram_used = get_zram_used_pages();
	nr_wm = div_u64(totalram_pages() * ratio, percent_constant);
	if (nr_zram_used > nr_wm)
		return (nr_zram_used - nr_wm) * PAGE_SIZE + SWAP_MORE_ZRAM;

	return 0;
}

u64 zswapd_buffer_diff(void)
{
	u64 buffers;
	u64 avail;

	buffers = calc_sys_cur_avail_buffers();
	avail = get_high_avail_buffers();
	if (buffers < avail)
		return (avail - buffers) * SZ_1M;

	return 0;
}

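/*
 * Size (in bytes) that zswapd should write from zram to eswap this
 * round: under CHECK_BUFFER_ZRAMRATIO_BOTH, the larger of the zram
 * watermark overshoot and the buffer shortfall; under CHECK_BUFFER_ONLY,
 * the buffer shortfall, but only if the zram watermark is exceeded or a
 * refault burst was detected.
 */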
u64 get_do_eswap_size(bool refault)
{
	u64 size = 0;
	enum zswapd_eswap_policy policy = get_zswapd_eswap_policy();

	if (policy == CHECK_BUFFER_ZRAMRATIO_BOTH)
		size = max(zram_watermark_diff(), zswapd_buffer_diff());
	else if (policy == CHECK_BUFFER_ONLY && (zram_watermark_ok() || refault))
		size = zswapd_buffer_diff();

	return size;
}

static int zswapd(void *p)
{
	struct task_struct *tsk = current;
	pg_data_t *pgdat = (pg_data_t *)p;
	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
#ifdef CONFIG_RECLAIM_ACCT
	struct reclaim_acct ra = {0};
#endif

	/* save zswapd pid for schedule strategy */
	zswapd_pid = tsk->pid;

	if (!cpumask_empty(cpumask))
		set_cpus_allowed_ptr(tsk, cpumask);

	set_freezable();

	while (!kthread_should_stop()) {
		bool refault = false;
		u64 size = 0;

		(void)wait_event_freezable(pgdat->zswapd_wait,
			atomic_read(&pgdat->zswapd_wait_flag));
		atomic_set(&pgdat->zswapd_wait_flag, 0);
		count_vm_event(ZSWAPD_WAKEUP);
		zswapd_pressure_report(LEVEL_LOW);

		if (get_area_anon_refault_status()) {
			refault = true;
			count_vm_event(ZSWAPD_REFAULT);
			goto do_eswap;
		}

#ifdef CONFIG_RECLAIM_ACCT
		reclaimacct_start(ZSWAPD_RECLAIM, &ra);
#endif
		zswapd_shrink_node(pgdat);
#ifdef CONFIG_RECLAIM_ACCT
		reclaimacct_end(ZSWAPD_RECLAIM);
#endif
		last_zswapd_time = jiffies;

do_eswap:
		size = get_do_eswap_size(refault);
		if (size >= SZ_1M) {
			count_vm_event(ZSWAPD_SWAPOUT);
			size = swapout(size);
		}

		if (!buffer_is_suitable()) {
			if (free_swap_is_low() || zram_watermark_exceed()) {
				zswapd_pressure_report(LEVEL_CRITICAL);
				count_vm_event(ZSWAPD_CRITICAL_PRESS);
				pr_info("%s:zrampages:%llu, eswappages:%llu\n", __func__,
					get_zram_used_pages(), get_eswap_used_pages());
			} else {
				zswapd_pressure_report(LEVEL_MEDIUM);
				count_vm_event(ZSWAPD_MEDIUM_PRESS);
			}
		}
	}

	return 0;
}

/*
 * This zswapd start function will be called by init and node-hot-add.
 */
int zswapd_run(int nid)
{
	const unsigned int priority_less = 5;
	struct sched_param param = {
		.sched_priority = MAX_PRIO - priority_less,
	};
	pg_data_t *pgdat = NODE_DATA(nid);

	if (pgdat->zswapd)
		return 0;

	atomic_set(&pgdat->zswapd_wait_flag, 0);
	pgdat->zswapd = kthread_create(zswapd, pgdat, "zswapd%d", nid);
	if (IS_ERR(pgdat->zswapd)) {
		pr_err("Failed to start zswapd on node %d\n", nid);
		return PTR_ERR(pgdat->zswapd);
	}

	sched_setscheduler_nocheck(pgdat->zswapd, SCHED_NORMAL, &param);
	set_user_nice(pgdat->zswapd, PRIO_TO_NICE(param.sched_priority));
	wake_up_process(pgdat->zswapd);

	return 0;
}

/*
 * Called by memory hotplug when all memory in a node is offlined. Caller must
 * hold mem_hotplug_begin/end().
 */
void zswapd_stop(int nid)
{
	struct task_struct *zswapd = NODE_DATA(nid)->zswapd;

	if (zswapd) {
		kthread_stop(zswapd);
		NODE_DATA(nid)->zswapd = NULL;
	}

	zswapd_pid = -1;
}

/*
 * It's optimal to keep zswapd tasks on the same CPUs as their memory, but
 * not required for correctness. So if the last cpu in a node goes away,
 * we get changed to run anywhere: as the first one comes back, restore
 * their cpu bindings.
 */
static int zswapd_cpu_online(unsigned int cpu)
{
	int nid;

	for_each_node_state(nid, N_MEMORY) {
		pg_data_t *pgdat = NODE_DATA(nid);
		const struct cpumask *mask;

		mask = cpumask_of_node(pgdat->node_id);
		if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
			/* One of our CPUs online: restore mask */
			set_cpus_allowed_ptr(pgdat->zswapd, mask);
	}

	return 0;
}

static int __init zswapd_init(void)
{
	int nid;
	int ret;

	ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "mm/zswapd:online",
					zswapd_cpu_online, NULL);
	if (ret < 0) {
		pr_err("zswapd: failed to register hotplug callbacks.\n");
		return ret;
	}

	for_each_node_state(nid, N_MEMORY)
		zswapd_run(nid);

	return 0;
}
module_init(zswapd_init);