// SPDX-License-Identifier: GPL-2.0
/*
 * fs/hmdfs/client_writeback.c
 *
 * Copyright (c) 2020-2021 Huawei Device Co., Ltd.
 */

#include <linux/backing-dev.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/page-flags.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>

#include "hmdfs.h"
#include "hmdfs_trace.h"

/* 200ms */
#define HMDFS_MAX_PAUSE			max((HZ / 5), 1)
#define HMDFS_BANDWIDTH_INTERVAL	max((HZ / 5), 1)
/* Dirty type */
#define HMDFS_DIRTY_FS			0
#define HMDFS_DIRTY_FILE		1
/* Exceed flags */
#define HMDFS_FS_EXCEED			(1 << HMDFS_DIRTY_FS)
#define HMDFS_FILE_EXCEED		(1 << HMDFS_DIRTY_FILE)
/* Ratelimit calculate shift */
#define HMDFS_LIMIT_SHIFT		10

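/*
 * Delayed work handler: kick background writeback for the whole hmdfs
 * superblock.
 */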
void hmdfs_writeback_inodes_sb_handler(struct work_struct *work)
{
	struct hmdfs_writeback *hwb = container_of(
		work, struct hmdfs_writeback, dirty_sb_writeback_work.work);

	try_to_writeback_inodes_sb(hwb->sbi->sb, WB_REASON_FS_FREE_SPACE);
}

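/*
 * Delayed work handler: drain the dirty-inode list and start writeback
 * for each inode on it. The iput() balances the ihold() taken in
 * hmdfs_writeback_inode() when the inode was queued.
 */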
void hmdfs_writeback_inode_handler(struct work_struct *work)
{
	struct hmdfs_inode_info *info = NULL;
	struct inode *inode = NULL;
	struct hmdfs_writeback *hwb = container_of(
		work, struct hmdfs_writeback, dirty_inode_writeback_work.work);

	spin_lock(&hwb->inode_list_lock);
	while (likely(!list_empty(&hwb->inode_list_head))) {
		info = list_first_entry(&hwb->inode_list_head,
					struct hmdfs_inode_info, wb_list);
		list_del_init(&info->wb_list);
		spin_unlock(&hwb->inode_list_lock);

		inode = &info->vfs_inode;
		write_inode_now(inode, 0);
		iput(inode);
		spin_lock(&hwb->inode_list_lock);
	}
	spin_unlock(&hwb->inode_list_lock);
}

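/*
 * Schedule superblock writeback after @delay milliseconds. A zero delay
 * (re)schedules the work immediately; otherwise the work is only queued
 * if it is not already pending or running.
 */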
static void hmdfs_writeback_inodes_sb_delayed(struct super_block *sb,
					      unsigned int delay)
{
	struct hmdfs_sb_info *sbi = sb->s_fs_info;
	unsigned long timeout;

	timeout = msecs_to_jiffies(delay);
	if (!timeout || !work_busy(&sbi->h_wb->dirty_sb_writeback_work.work))
		mod_delayed_work(sbi->h_wb->dirty_sb_writeback_wq,
				 &sbi->h_wb->dirty_sb_writeback_work, timeout);
}

static inline void hmdfs_writeback_inodes_sb(struct super_block *sb)
{
	hmdfs_writeback_inodes_sb_delayed(sb, 0);
}

static void hmdfs_writeback_inode(struct super_block *sb, struct inode *inode)
{
	struct hmdfs_sb_info *sbi = sb->s_fs_info;
	struct hmdfs_writeback *hwb = sbi->h_wb;
	struct hmdfs_inode_info *info = hmdfs_i(inode);

	spin_lock(&hwb->inode_list_lock);
	if (list_empty(&info->wb_list)) {
		ihold(inode);
		list_add_tail(&info->wb_list, &hwb->inode_list_head);
		queue_delayed_work(hwb->dirty_inode_writeback_wq,
				   &hwb->dirty_inode_writeback_work, 0);
	}
	spin_unlock(&hwb->inode_list_lock);
}

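/*
 * Count the pages in @inode's mapping that carry @tag
 * (PAGECACHE_TAG_DIRTY or PAGECACHE_TAG_WRITEBACK).
 */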
static unsigned long hmdfs_idirty_pages(struct inode *inode, int tag)
{
#if KERNEL_VERSION(6, 3, 0) <= LINUX_VERSION_CODE
	struct folio_batch fbatch;
#else
	struct pagevec pvec;
#endif
	unsigned long nr_dirty_pages = 0;
	pgoff_t index = 0;

#if KERNEL_VERSION(4, 15, 0) <= LINUX_VERSION_CODE
#if KERNEL_VERSION(6, 3, 0) <= LINUX_VERSION_CODE
	folio_batch_init(&fbatch);
#else
	pagevec_init(&pvec);
#endif
#else
	pagevec_init(&pvec, 0);
#endif

#if KERNEL_VERSION(6, 3, 0) <= LINUX_VERSION_CODE
	while (filemap_get_folios_tag(inode->i_mapping, &index, (pgoff_t)-1,
				      tag, &fbatch)) {
		unsigned int i;

		for (i = 0; i < folio_batch_count(&fbatch); i++) {
			struct folio *folio = fbatch.folios[i];

			if (folio_test_dirty(folio) ||
			    folio_test_writeback(folio))
				nr_dirty_pages++;
		}
		folio_batch_release(&fbatch);
		cond_resched();
	}
#else
	while (pagevec_lookup_tag(&pvec, inode->i_mapping, &index, tag)) {
		nr_dirty_pages += pagevec_count(&pvec);
		pagevec_release(&pvec);
		cond_resched();
	}
#endif

	return nr_dirty_pages;
}

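/*
 * Fixed-point ratio helpers with HMDFS_LIMIT_SHIFT bits of precision:
 * hmdfs_thresh_ratio() converts base/thresh into a ratio, and
 * hmdfs_ratio_thresh() applies that ratio to a thresh. For example,
 * base = 100 and thresh = 400 give ratio = (100 << 10) / 400 = 256,
 * and hmdfs_ratio_thresh(256, 400) = (256 * 400) >> 10 = 100.
 */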
static inline unsigned long hmdfs_ratio_thresh(unsigned long ratio,
					       unsigned long thresh)
{
	unsigned long ret = (ratio * thresh) >> HMDFS_LIMIT_SHIFT;

	return (ret == 0) ? 1 : ret;
}

static inline unsigned long hmdfs_thresh_ratio(unsigned long base,
					       unsigned long thresh)
{
	unsigned long ratio = (base << HMDFS_LIMIT_SHIFT) / thresh;

	return (ratio == 0) ? 1 : ratio;
}

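/*
 * Convert the configured dirty byte limits into page-based thresholds
 * and precompute the ratios used when these thresholds are later
 * rescaled against the write bandwidth.
 */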
void hmdfs_calculate_dirty_thresh(struct hmdfs_writeback *hwb)
{
	hwb->dirty_fs_thresh = DIV_ROUND_UP(hwb->dirty_fs_bytes, PAGE_SIZE);
	hwb->dirty_file_thresh = DIV_ROUND_UP(hwb->dirty_file_bytes, PAGE_SIZE);
	hwb->dirty_fs_bg_thresh =
		DIV_ROUND_UP(hwb->dirty_fs_bg_bytes, PAGE_SIZE);
	hwb->dirty_file_bg_thresh =
		DIV_ROUND_UP(hwb->dirty_file_bg_bytes, PAGE_SIZE);

	hwb->fs_bg_ratio = hmdfs_thresh_ratio(hwb->dirty_fs_bg_thresh,
					      hwb->dirty_fs_thresh);
	hwb->file_bg_ratio = hmdfs_thresh_ratio(hwb->dirty_file_bg_thresh,
						hwb->dirty_file_thresh);
	hwb->fs_file_ratio = hmdfs_thresh_ratio(hwb->dirty_file_thresh,
						hwb->dirty_fs_thresh);
}

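/*
 * Load the throttle-control thresholds from the current settings,
 * capped by the last bandwidth-adjusted values when auto thresholding
 * is enabled.
 */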
static void hmdfs_init_dirty_limit(struct hmdfs_dirty_throttle_control *hdtc)
{
	struct hmdfs_writeback *hwb = hdtc->hwb;

	hdtc->fs_thresh = hdtc->hwb->dirty_fs_thresh;
	hdtc->file_thresh = hdtc->hwb->dirty_file_thresh;
	hdtc->fs_bg_thresh = hdtc->hwb->dirty_fs_bg_thresh;
	hdtc->file_bg_thresh = hdtc->hwb->dirty_file_bg_thresh;

	if (!hwb->dirty_auto_threshold)
		return;

	/*
	 * Initialize the thresh from the previous bandwidth-adjusted
	 * thresh; it must not exceed the configured thresh.
	 */
	if (hwb->bw_fs_thresh < hdtc->fs_thresh) {
		hdtc->fs_thresh = hwb->bw_fs_thresh;
		hdtc->fs_bg_thresh = hmdfs_ratio_thresh(hwb->fs_bg_ratio,
							hdtc->fs_thresh);
	}
	if (hwb->bw_file_thresh < hdtc->file_thresh) {
		hdtc->file_thresh = hwb->bw_file_thresh;
		hdtc->file_bg_thresh = hmdfs_ratio_thresh(hwb->file_bg_ratio,
							  hdtc->file_thresh);
	}
	/*
	 * The thresh should be updated the first time the dirty pages
	 * exceed the freerun ceiling.
	 */
	hdtc->thresh_time_stamp = jiffies - HMDFS_BANDWIDTH_INTERVAL - 1;
}

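/*
 * Rescale the dirty thresholds against the current average write
 * bandwidth so that the permitted backlog roughly matches what can be
 * written back within the configured writeback time limit. The result
 * never exceeds the configured per-fs threshold.
 */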
static void hmdfs_update_dirty_limit(struct hmdfs_dirty_throttle_control *hdtc)
{
	struct hmdfs_writeback *hwb = hdtc->hwb;
	struct bdi_writeback *wb = hwb->wb;
	unsigned int time_limit = hwb->writeback_timelimit;
	unsigned long bw = wb->avg_write_bandwidth;
	unsigned long thresh;

	if (!hwb->dirty_auto_threshold)
		return;

	spin_lock(&hwb->write_bandwidth_lock);
	if (bw > hwb->max_write_bandwidth)
		hwb->max_write_bandwidth = bw;

	if (bw < hwb->min_write_bandwidth)
		hwb->min_write_bandwidth = bw;
	hwb->avg_write_bandwidth = bw;
	spin_unlock(&hwb->write_bandwidth_lock);

	/*
	 * If the bandwidth is below the lower limit, the peer is probably
	 * offline, and it is meaningless to set such a low thresh.
	 */
	bw = max(bw, hwb->bw_thresh_lowerlimit);
	thresh = bw * time_limit / roundup_pow_of_two(HZ);
	if (thresh >= hwb->dirty_fs_thresh) {
		hdtc->fs_thresh = hwb->dirty_fs_thresh;
		hdtc->file_thresh = hwb->dirty_file_thresh;
		hdtc->fs_bg_thresh = hwb->dirty_fs_bg_thresh;
		hdtc->file_bg_thresh = hwb->dirty_file_bg_thresh;
	} else {
		/* Adjust thresh according to the current bandwidth */
		hdtc->fs_thresh = thresh;
		hdtc->fs_bg_thresh = hmdfs_ratio_thresh(hwb->fs_bg_ratio,
							hdtc->fs_thresh);
		hdtc->file_thresh = hmdfs_ratio_thresh(hwb->fs_file_ratio,
						       hdtc->fs_thresh);
		hdtc->file_bg_thresh = hmdfs_ratio_thresh(hwb->file_bg_ratio,
							  hdtc->file_thresh);
	}
	/* Save the bandwidth-adjusted thresh */
	hwb->bw_fs_thresh = hdtc->fs_thresh;
	hwb->bw_file_thresh = hdtc->file_thresh;
	/* Update the time stamp */
	hdtc->thresh_time_stamp = jiffies;
}

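/*
 * Recompute the per-CPU ratelimit from the lowest (per-file background)
 * threshold so that dirtying tasks call hmdfs_balance_dirty_pages()
 * often enough.
 */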
void hmdfs_update_ratelimit(struct hmdfs_writeback *hwb)
{
	struct hmdfs_dirty_throttle_control hdtc = {.hwb = hwb};

	hmdfs_init_dirty_limit(&hdtc);

	/* hdtc.file_bg_thresh should be the lowest thresh */
	hwb->ratelimit_pages = hdtc.file_bg_thresh /
			       (num_online_cpus() * HMDFS_RATELIMIT_PAGES_GAP);
	if (hwb->ratelimit_pages < HMDFS_MIN_RATELIMIT_PAGES)
		hwb->ratelimit_pages = HMDFS_MIN_RATELIMIT_PAGES;
}

/* This is a copy of wb_max_pause() */
static unsigned long hmdfs_wb_pause(struct bdi_writeback *wb,
				    unsigned long wb_dirty)
{
	unsigned long bw = wb->avg_write_bandwidth;
	unsigned long t;

	/*
	 * Limit pause time for small memory systems. If sleeping for too
	 * long, a small pool of dirty/writeback pages may go empty and the
	 * disk go idle.
	 *
	 * 8 serves as the safety ratio.
	 */
	t = wb_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
	t++;

	return min_t(unsigned long, t, HMDFS_MAX_PAUSE);
}

static unsigned long
hmdfs_dirty_freerun_ceiling(struct hmdfs_dirty_throttle_control *hdtc,
			    unsigned int type)
{
	if (type == HMDFS_DIRTY_FS)
		return (hdtc->fs_thresh + hdtc->fs_bg_thresh) / 2;
	else /* HMDFS_DIRTY_FILE */
		return (hdtc->file_thresh + hdtc->file_bg_thresh) / 2;
}

/* This is a copy of dirty_poll_interval() */
static inline unsigned long hmdfs_dirty_intv(unsigned long dirty,
					     unsigned long thresh)
{
	if (thresh > dirty)
		return 1UL << (ilog2(thresh - dirty) >> 1);
	return 1;
}

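/*
 * Throttle a dirtying task. The task runs freely while both the per-fs
 * and the per-file reclaimable counts stay below their freerun
 * ceilings; once a ceiling is crossed, background writeback is kicked,
 * and while either hard threshold is exceeded the task sleeps in
 * HMDFS_MAX_PAUSE-bounded steps.
 */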
static void hmdfs_balance_dirty_pages(struct address_space *mapping)
{
	struct inode *inode = mapping->host;
	struct super_block *sb = inode->i_sb;
	struct hmdfs_sb_info *sbi = sb->s_fs_info;
	struct hmdfs_writeback *hwb = sbi->h_wb;
	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
	struct hmdfs_dirty_throttle_control hdtc = {.hwb = hwb};
	unsigned int dirty_exceeded = 0;
	unsigned long start_time = jiffies;
	unsigned long pause = 0;

	/* Queue delayed work to trigger timeout writeback */
	if (hwb->dirty_writeback_interval != 0)
		hmdfs_writeback_inodes_sb_delayed(
			sb, hwb->dirty_writeback_interval * 10);

	hmdfs_init_dirty_limit(&hdtc);

	while (1) {
		unsigned long exceed = 0;
		unsigned long diff;

		/* Per-filesystem overbalance writeback */
		hdtc.fs_nr_dirty = wb_stat_sum(wb, WB_RECLAIMABLE);
		hdtc.fs_nr_reclaimable =
			hdtc.fs_nr_dirty + wb_stat_sum(wb, WB_WRITEBACK);
		if (hdtc.fs_nr_reclaimable < hdtc.file_bg_thresh) {
			diff = hmdfs_dirty_intv(hdtc.fs_nr_reclaimable,
						hdtc.file_thresh);
			goto free_running;
		}

		/* Per-file overbalance writeback */
		hdtc.file_nr_dirty =
			hmdfs_idirty_pages(inode, PAGECACHE_TAG_DIRTY);
		hdtc.file_nr_reclaimable =
			hmdfs_idirty_pages(inode, PAGECACHE_TAG_WRITEBACK) +
			hdtc.file_nr_dirty;
		if ((hdtc.fs_nr_reclaimable <
		     hmdfs_dirty_freerun_ceiling(&hdtc, HMDFS_DIRTY_FS)) &&
		    (hdtc.file_nr_reclaimable <
		     hmdfs_dirty_freerun_ceiling(&hdtc, HMDFS_DIRTY_FILE))) {
			unsigned long fs_intv, file_intv;

			fs_intv = hmdfs_dirty_intv(hdtc.fs_nr_reclaimable,
						   hdtc.fs_thresh);
			file_intv = hmdfs_dirty_intv(hdtc.file_nr_reclaimable,
						     hdtc.file_thresh);
			diff = min(fs_intv, file_intv);
free_running:
			current->nr_dirtied_pause = diff;
			current->nr_dirtied = 0;
			break;
		}

		if (hdtc.fs_nr_reclaimable >=
		    hmdfs_dirty_freerun_ceiling(&hdtc, HMDFS_DIRTY_FS)) {
			if (unlikely(!writeback_in_progress(wb)))
				hmdfs_writeback_inodes_sb(sb);
		} else {
			hmdfs_writeback_inode(sb, inode);
		}

		/*
		 * If dirty_auto_threshold is enabled, recalculate the
		 * writeback thresh according to the current bandwidth.
		 * Updating the bandwidth here would be better, but
		 * wb_update_bandwidth() is not exported, so the bandwidth
		 * update is delayed when writing heavily to a single file.
		 */
		if (hwb->dirty_auto_threshold &&
		    time_is_before_jiffies(hdtc.thresh_time_stamp +
					   HMDFS_BANDWIDTH_INTERVAL))
			hmdfs_update_dirty_limit(&hdtc);

		if (unlikely(hdtc.fs_nr_reclaimable >= hdtc.fs_thresh))
			exceed |= HMDFS_FS_EXCEED;
		if (unlikely(hdtc.file_nr_reclaimable >= hdtc.file_thresh))
			exceed |= HMDFS_FILE_EXCEED;

		if (!exceed) {
			trace_hmdfs_balance_dirty_pages(sbi, wb, &hdtc,
							0UL, start_time);
			current->nr_dirtied = 0;
			break;
		}
		/*
		 * Per-file or per-fs reclaimable pages exceed the throttle
		 * limit: sleep for the pause time and check again.
		 */
		dirty_exceeded |= exceed;
		if (dirty_exceeded && !hwb->dirty_exceeded)
			hwb->dirty_exceeded = true;

		/* Pause */
		pause = hmdfs_wb_pause(wb, hdtc.fs_nr_reclaimable);

		trace_hmdfs_balance_dirty_pages(sbi, wb, &hdtc, pause,
						start_time);

		__set_current_state(TASK_KILLABLE);
		io_schedule_timeout(pause);

		if (fatal_signal_pending(current))
			break;
	}

	if (!dirty_exceeded && hwb->dirty_exceeded)
		hwb->dirty_exceeded = false;

	if (hdtc.fs_nr_reclaimable >= hdtc.fs_bg_thresh) {
		if (unlikely(!writeback_in_progress(wb)))
			hmdfs_writeback_inodes_sb(sb);
	} else if (hdtc.file_nr_reclaimable >= hdtc.file_bg_thresh) {
		hmdfs_writeback_inode(sb, inode);
	}
}

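/*
 * Ratelimited entry point for the write path: cheap per-CPU and
 * per-task counters decide how often the more expensive
 * hmdfs_balance_dirty_pages() is actually invoked.
 */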
void hmdfs_balance_dirty_pages_ratelimited(struct address_space *mapping)
{
	struct hmdfs_sb_info *sbi = mapping->host->i_sb->s_fs_info;
	struct hmdfs_writeback *hwb = sbi->h_wb;
	int *bdp_ratelimits = NULL;
	int ratelimit;

	if (!hwb->dirty_writeback_control)
		return;

	/* Queue delayed work to trigger timeout writeback */
	if (hwb->dirty_writeback_interval != 0)
		hmdfs_writeback_inodes_sb_delayed(
			mapping->host->i_sb,
			hwb->dirty_writeback_interval * 10);

	ratelimit = current->nr_dirtied_pause;
	if (hwb->dirty_exceeded)
		ratelimit = min(ratelimit, HMDFS_DIRTY_EXCEED_RATELIMIT);

	/*
	 * This prevents one CPU from accumulating too many dirtied pages
	 * without calling into hmdfs_balance_dirty_pages(), which can
	 * happen when 1000+ tasks all start dirtying pages at the same
	 * time, each honouring a too-large initial task->nr_dirtied_pause.
	 */
	preempt_disable();
	bdp_ratelimits = this_cpu_ptr(hwb->bdp_ratelimits);

	trace_hmdfs_balance_dirty_pages_ratelimited(sbi, hwb, *bdp_ratelimits);

	if (unlikely(current->nr_dirtied >= ratelimit)) {
		*bdp_ratelimits = 0;
	} else if (unlikely(*bdp_ratelimits >= hwb->ratelimit_pages)) {
		*bdp_ratelimits = 0;
		ratelimit = 0;
	}
	preempt_enable();

	if (unlikely(current->nr_dirtied >= ratelimit))
		hmdfs_balance_dirty_pages(mapping);
}

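/* Tear down the state set up by hmdfs_init_writeback(). */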
void hmdfs_destroy_writeback(struct hmdfs_sb_info *sbi)
{
	if (!sbi->h_wb)
		return;

	flush_delayed_work(&sbi->h_wb->dirty_sb_writeback_work);
	flush_delayed_work(&sbi->h_wb->dirty_inode_writeback_work);
	destroy_workqueue(sbi->h_wb->dirty_sb_writeback_wq);
	destroy_workqueue(sbi->h_wb->dirty_inode_writeback_wq);
	free_percpu(sbi->h_wb->bdp_ratelimits);
	kfree(sbi->h_wb);
	sbi->h_wb = NULL;
}

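/*
 * Allocate and initialize the per-superblock writeback control: dirty
 * thresholds, per-CPU ratelimit counters and the two single-threaded
 * workqueues used for delayed inode and superblock writeback.
 */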
int hmdfs_init_writeback(struct hmdfs_sb_info *sbi)
{
	struct hmdfs_writeback *hwb;
	char name[HMDFS_WQ_NAME_LEN];
	int ret = -ENOMEM;

	hwb = kzalloc(sizeof(struct hmdfs_writeback), GFP_KERNEL);
	if (!hwb)
		return ret;

	hwb->sbi = sbi;
	hwb->wb = &sbi->sb->s_bdi->wb;
	hwb->dirty_writeback_control = true;
	hwb->dirty_writeback_interval = HM_DEFAULT_WRITEBACK_INTERVAL;
	hwb->dirty_file_bg_bytes = HMDFS_FILE_BG_WB_BYTES;
	hwb->dirty_fs_bg_bytes = HMDFS_FS_BG_WB_BYTES;
	hwb->dirty_file_bytes = HMDFS_FILE_WB_BYTES;
	hwb->dirty_fs_bytes = HMDFS_FS_WB_BYTES;
	hmdfs_calculate_dirty_thresh(hwb);
	hwb->bw_file_thresh = hwb->dirty_file_thresh;
	hwb->bw_fs_thresh = hwb->dirty_fs_thresh;
	spin_lock_init(&hwb->inode_list_lock);
	INIT_LIST_HEAD(&hwb->inode_list_head);
	hwb->dirty_exceeded = false;
	hwb->ratelimit_pages = HMDFS_DEF_RATELIMIT_PAGES;
	hwb->dirty_auto_threshold = true;
	hwb->writeback_timelimit = HMDFS_DEF_WB_TIMELIMIT;
	hwb->bw_thresh_lowerlimit = HMDFS_BW_THRESH_DEF_LIMIT;
	spin_lock_init(&hwb->write_bandwidth_lock);
	hwb->avg_write_bandwidth = 0;
	hwb->max_write_bandwidth = 0;
	hwb->min_write_bandwidth = ULONG_MAX;
	hwb->bdp_ratelimits = alloc_percpu(int);
	if (!hwb->bdp_ratelimits)
		goto free_hwb;

	snprintf(name, sizeof(name), "dfs_ino_wb%u", sbi->seq);
	hwb->dirty_inode_writeback_wq = create_singlethread_workqueue(name);
	if (!hwb->dirty_inode_writeback_wq) {
		hmdfs_err("Failed to create inode writeback workqueue!");
		goto free_bdp;
	}
	snprintf(name, sizeof(name), "dfs_sb_wb%u", sbi->seq);
	hwb->dirty_sb_writeback_wq = create_singlethread_workqueue(name);
	if (!hwb->dirty_sb_writeback_wq) {
		hmdfs_err("Failed to create filesystem writeback workqueue!");
		goto free_i_wq;
	}
	INIT_DELAYED_WORK(&hwb->dirty_sb_writeback_work,
			  hmdfs_writeback_inodes_sb_handler);
	INIT_DELAYED_WORK(&hwb->dirty_inode_writeback_work,
			  hmdfs_writeback_inode_handler);
	sbi->h_wb = hwb;
	return 0;
free_i_wq:
	destroy_workqueue(hwb->dirty_inode_writeback_wq);
free_bdp:
	free_percpu(hwb->bdp_ratelimits);
free_hwb:
	kfree(hwb);
	return ret;
}