// SPDX-License-Identifier: GPL-2.0
/*
 * fs/hmdfs/client_writeback.c
 *
 * Copyright (c) 2020-2021 Huawei Device Co., Ltd.
 */

#include <linux/backing-dev.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/page-flags.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/version.h>

#include "hmdfs.h"
#include "hmdfs_trace.h"

/* 200ms */
#define HMDFS_MAX_PAUSE			max((HZ / 5), 1)
#define HMDFS_BANDWIDTH_INTERVAL	max((HZ / 5), 1)
/* Dirty type */
#define HMDFS_DIRTY_FS			0
#define HMDFS_DIRTY_FILE		1
/* Exceed flags */
#define HMDFS_FS_EXCEED			(1 << HMDFS_DIRTY_FS)
#define HMDFS_FILE_EXCEED		(1 << HMDFS_DIRTY_FILE)
/* Ratelimit calculate shift */
#define HMDFS_LIMIT_SHIFT		10

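/*
 * Delayed work handler: kick background writeback for the whole
 * superblock so that dirty pages of this filesystem get flushed.
 */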
void hmdfs_writeback_inodes_sb_handler(struct work_struct *work)
{
	struct hmdfs_writeback *hwb = container_of(
		work, struct hmdfs_writeback, dirty_sb_writeback_work.work);

	try_to_writeback_inodes_sb(hwb->sbi->sb, WB_REASON_FS_FREE_SPACE);
}

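/*
 * Delayed work handler: write back every inode queued on inode_list_head
 * and drop the reference that was taken when the inode was queued.
 */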
void hmdfs_writeback_inode_handler(struct work_struct *work)
{
	struct hmdfs_inode_info *info = NULL;
	struct inode *inode = NULL;
	struct hmdfs_writeback *hwb = container_of(
		work, struct hmdfs_writeback, dirty_inode_writeback_work.work);

	spin_lock(&hwb->inode_list_lock);
	while (likely(!list_empty(&hwb->inode_list_head))) {
		info = list_first_entry(&hwb->inode_list_head,
					struct hmdfs_inode_info, wb_list);
		list_del_init(&info->wb_list);
		spin_unlock(&hwb->inode_list_lock);

		inode = &info->vfs_inode;
		write_inode_now(inode, 0);
		iput(inode);
		spin_lock(&hwb->inode_list_lock);
	}
	spin_unlock(&hwb->inode_list_lock);
}

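/*
 * Schedule superblock writeback after @delay milliseconds. A zero delay
 * (re)arms the work immediately; a non-zero delay only arms it if the
 * work is not already pending or running. hmdfs_writeback_inodes_sb()
 * below is the immediate variant.
 */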
static void hmdfs_writeback_inodes_sb_delayed(struct super_block *sb,
					      unsigned int delay)
{
	struct hmdfs_sb_info *sbi = sb->s_fs_info;
	unsigned long timeout;

	timeout = msecs_to_jiffies(delay);
	if (!timeout || !work_busy(&sbi->h_wb->dirty_sb_writeback_work.work))
		mod_delayed_work(sbi->h_wb->dirty_sb_writeback_wq,
				 &sbi->h_wb->dirty_sb_writeback_work, timeout);
}

static inline void hmdfs_writeback_inodes_sb(struct super_block *sb)
{
	hmdfs_writeback_inodes_sb_delayed(sb, 0);
}

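/*
 * Queue a single inode for writeback. A reference to the inode is held
 * while it sits on the list and is dropped by the work handler.
 */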
static void hmdfs_writeback_inode(struct super_block *sb, struct inode *inode)
{
	struct hmdfs_sb_info *sbi = sb->s_fs_info;
	struct hmdfs_writeback *hwb = sbi->h_wb;
	struct hmdfs_inode_info *info = hmdfs_i(inode);

	spin_lock(&hwb->inode_list_lock);
	if (list_empty(&info->wb_list)) {
		ihold(inode);
		list_add_tail(&info->wb_list, &hwb->inode_list_head);
		queue_delayed_work(hwb->dirty_inode_writeback_wq,
				   &hwb->dirty_inode_writeback_work, 0);
	}
	spin_unlock(&hwb->inode_list_lock);
}

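/*
 * Count the pages in the inode's mapping that carry the given page
 * cache tag (dirty or writeback).
 */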
static unsigned long hmdfs_idirty_pages(struct inode *inode, int tag)
{
	struct pagevec pvec;
	unsigned long nr_dirty_pages = 0;
	pgoff_t index = 0;

#if KERNEL_VERSION(4, 15, 0) <= LINUX_VERSION_CODE
	pagevec_init(&pvec);
#else
	pagevec_init(&pvec, 0);
#endif
	while (pagevec_lookup_tag(&pvec, inode->i_mapping, &index, tag)) {
		nr_dirty_pages += pagevec_count(&pvec);
		pagevec_release(&pvec);
		cond_resched();
	}
	return nr_dirty_pages;
}

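/*
 * Fixed-point helpers: hmdfs_thresh_ratio() converts base/thresh into a
 * ratio scaled by HMDFS_LIMIT_SHIFT, and hmdfs_ratio_thresh() applies
 * such a ratio back to a thresh. Both clamp the result to at least 1.
 */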
static inline unsigned long hmdfs_ratio_thresh(unsigned long ratio,
					       unsigned long thresh)
{
	unsigned long ret = (ratio * thresh) >> HMDFS_LIMIT_SHIFT;

	return (ret == 0) ? 1 : ret;
}

static inline unsigned long hmdfs_thresh_ratio(unsigned long base,
					       unsigned long thresh)
{
	unsigned long ratio = (base << HMDFS_LIMIT_SHIFT) / thresh;

	return (ratio == 0) ? 1 : ratio;
}

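/*
 * Convert the configured byte limits into page thresholds and precompute
 * the background/foreground and file/fs ratios used when the thresholds
 * are rescaled against the write bandwidth.
 */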
void hmdfs_calculate_dirty_thresh(struct hmdfs_writeback *hwb)
{
	hwb->dirty_fs_thresh = DIV_ROUND_UP(hwb->dirty_fs_bytes, PAGE_SIZE);
	hwb->dirty_file_thresh = DIV_ROUND_UP(hwb->dirty_file_bytes, PAGE_SIZE);
	hwb->dirty_fs_bg_thresh =
		DIV_ROUND_UP(hwb->dirty_fs_bg_bytes, PAGE_SIZE);
	hwb->dirty_file_bg_thresh =
		DIV_ROUND_UP(hwb->dirty_file_bg_bytes, PAGE_SIZE);

	hwb->fs_bg_ratio = hmdfs_thresh_ratio(hwb->dirty_fs_bg_thresh,
					      hwb->dirty_fs_thresh);
	hwb->file_bg_ratio = hmdfs_thresh_ratio(hwb->dirty_file_bg_thresh,
						hwb->dirty_file_thresh);
	hwb->fs_file_ratio = hmdfs_thresh_ratio(hwb->dirty_file_thresh,
						hwb->dirty_fs_thresh);
}

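/*
 * Seed the throttle control with the configured thresholds. If automatic
 * thresholds are enabled, clamp them by the previous bandwidth-adjusted
 * values and force an update on the first freerun-ceiling crossing.
 */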
static void hmdfs_init_dirty_limit(struct hmdfs_dirty_throttle_control *hdtc)
{
	struct hmdfs_writeback *hwb = hdtc->hwb;

	hdtc->fs_thresh = hdtc->hwb->dirty_fs_thresh;
	hdtc->file_thresh = hdtc->hwb->dirty_file_thresh;
	hdtc->fs_bg_thresh = hdtc->hwb->dirty_fs_bg_thresh;
	hdtc->file_bg_thresh = hdtc->hwb->dirty_file_bg_thresh;

	if (!hwb->dirty_auto_threshold)
		return;

	/*
	 * Initialize thresh from the previous bandwidth-adjusted thresh;
	 * it must not exceed the configured thresh.
	 */
	if (hwb->bw_fs_thresh < hdtc->fs_thresh) {
		hdtc->fs_thresh = hwb->bw_fs_thresh;
		hdtc->fs_bg_thresh = hmdfs_ratio_thresh(hwb->fs_bg_ratio,
							hdtc->fs_thresh);
	}
	if (hwb->bw_file_thresh < hdtc->file_thresh) {
		hdtc->file_thresh = hwb->bw_file_thresh;
		hdtc->file_bg_thresh = hmdfs_ratio_thresh(hwb->file_bg_ratio,
							  hdtc->file_thresh);
	}
	/*
	 * Force a thresh update the first time dirty pages exceed the
	 * freerun ceiling.
	 */
	hdtc->thresh_time_stamp = jiffies - HMDFS_BANDWIDTH_INTERVAL - 1;
}

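/*
 * Rescale the dirty thresholds against the current average write
 * bandwidth so that fs_thresh roughly matches what can be written back
 * within the writeback time limit, capped at the configured thresholds.
 * Min/max/avg bandwidth statistics are updated as a side effect.
 */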
static void hmdfs_update_dirty_limit(struct hmdfs_dirty_throttle_control *hdtc)
{
	struct hmdfs_writeback *hwb = hdtc->hwb;
	struct bdi_writeback *wb = hwb->wb;
	unsigned int time_limit = hwb->writeback_timelimit;
	unsigned long bw = wb->avg_write_bandwidth;
	unsigned long thresh;

	if (!hwb->dirty_auto_threshold)
		return;

	spin_lock(&hwb->write_bandwidth_lock);
	if (bw > hwb->max_write_bandwidth)
		hwb->max_write_bandwidth = bw;

	if (bw < hwb->min_write_bandwidth)
		hwb->min_write_bandwidth = bw;
	hwb->avg_write_bandwidth = bw;
	spin_unlock(&hwb->write_bandwidth_lock);

	/*
	 * If the bandwidth is below the lower limit, the peer is probably
	 * offline and it is meaningless to set such a low thresh.
	 */
	bw = max(bw, hwb->bw_thresh_lowerlimit);
	thresh = bw * time_limit / roundup_pow_of_two(HZ);
	if (thresh >= hwb->dirty_fs_thresh) {
		hdtc->fs_thresh = hwb->dirty_fs_thresh;
		hdtc->file_thresh = hwb->dirty_file_thresh;
		hdtc->fs_bg_thresh = hwb->dirty_fs_bg_thresh;
		hdtc->file_bg_thresh = hwb->dirty_file_bg_thresh;
	} else {
		/* Adjust thresh according to current bandwidth */
		hdtc->fs_thresh = thresh;
		hdtc->fs_bg_thresh = hmdfs_ratio_thresh(hwb->fs_bg_ratio,
							hdtc->fs_thresh);
		hdtc->file_thresh = hmdfs_ratio_thresh(hwb->fs_file_ratio,
						       hdtc->fs_thresh);
		hdtc->file_bg_thresh = hmdfs_ratio_thresh(hwb->file_bg_ratio,
							  hdtc->file_thresh);
	}
	/* Save bandwidth adjusted thresh */
	hwb->bw_fs_thresh = hdtc->fs_thresh;
	hwb->bw_file_thresh = hdtc->file_thresh;
	/* Update time stamp */
	hdtc->thresh_time_stamp = jiffies;
}

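/*
 * Recompute the per-CPU ratelimit from the lowest threshold
 * (file_bg_thresh) so that dirtiers spread over all online CPUs still
 * call into hmdfs_balance_dirty_pages() often enough.
 */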
void hmdfs_update_ratelimit(struct hmdfs_writeback *hwb)
{
	struct hmdfs_dirty_throttle_control hdtc = {.hwb = hwb};

	hmdfs_init_dirty_limit(&hdtc);

	/* hdtc.file_bg_thresh should be the lowest thresh */
	hwb->ratelimit_pages = hdtc.file_bg_thresh /
			       (num_online_cpus() * HMDFS_RATELIMIT_PAGES_GAP);
	if (hwb->ratelimit_pages < HMDFS_MIN_RATELIMIT_PAGES)
		hwb->ratelimit_pages = HMDFS_MIN_RATELIMIT_PAGES;
}

/* This is a copy of wb_max_pause() */
static unsigned long hmdfs_wb_pause(struct bdi_writeback *wb,
					unsigned long wb_dirty)
{
	unsigned long bw = wb->avg_write_bandwidth;
	unsigned long t;

	/*
	 * Limit pause time for small memory systems. If sleeping for too
	 * long, a small pool of dirty/writeback pages may go empty and the
	 * disk may go idle.
	 *
	 * 8 serves as the safety ratio.
	 */
	t = wb_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
	t++;

	return min_t(unsigned long, t, HMDFS_MAX_PAUSE);
}

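/*
 * The freerun ceiling is the midpoint between the background and hard
 * thresholds; below it, dirtiers are not throttled at all.
 */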
static unsigned long
hmdfs_dirty_freerun_ceiling(struct hmdfs_dirty_throttle_control *hdtc,
			    unsigned int type)
{
	if (type == HMDFS_DIRTY_FS)
		return (hdtc->fs_thresh + hdtc->fs_bg_thresh) / 2;
	else /* HMDFS_DIRTY_FILE */
		return (hdtc->file_thresh + hdtc->file_bg_thresh) / 2;
}

/* This is a copy of dirty_poll_interval() */
static inline unsigned long hmdfs_dirty_intv(unsigned long dirty,
					     unsigned long thresh)
{
	if (thresh > dirty)
		return 1UL << (ilog2(thresh - dirty) >> 1);
	return 1;
}

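/*
 * Throttle the current dirtier. While either the per-fs or the per-file
 * reclaimable page count exceeds its threshold, kick writeback and sleep
 * for a bandwidth-derived pause; otherwise compute a new
 * nr_dirtied_pause interval for the task and return.
 */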
static void hmdfs_balance_dirty_pages(struct address_space *mapping)
{
	struct inode *inode = mapping->host;
	struct super_block *sb = inode->i_sb;
	struct hmdfs_sb_info *sbi = sb->s_fs_info;
	struct hmdfs_writeback *hwb = sbi->h_wb;
	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
	struct hmdfs_dirty_throttle_control hdtc = {.hwb = hwb};
	unsigned int dirty_exceeded = 0;
	unsigned long start_time = jiffies;
	unsigned long pause = 0;

	/* Queue delayed work to trigger timeout writeback */
	if (hwb->dirty_writeback_interval != 0)
		hmdfs_writeback_inodes_sb_delayed(
			sb, hwb->dirty_writeback_interval * 10);

	hmdfs_init_dirty_limit(&hdtc);

	while (1) {
		unsigned long exceed = 0;
		unsigned long diff;

		/* Per-filesystem overbalance writeback */
		hdtc.fs_nr_dirty = wb_stat_sum(wb, WB_RECLAIMABLE);
		hdtc.fs_nr_reclaimable =
			hdtc.fs_nr_dirty + wb_stat_sum(wb, WB_WRITEBACK);
		if (hdtc.fs_nr_reclaimable < hdtc.file_bg_thresh) {
			diff = hmdfs_dirty_intv(hdtc.fs_nr_reclaimable,
						hdtc.file_thresh);
			goto free_running;
		}

		/* Per-file overbalance writeback */
		hdtc.file_nr_dirty =
			hmdfs_idirty_pages(inode, PAGECACHE_TAG_DIRTY);
		hdtc.file_nr_reclaimable =
			hmdfs_idirty_pages(inode, PAGECACHE_TAG_WRITEBACK) +
			hdtc.file_nr_dirty;
		if ((hdtc.fs_nr_reclaimable <
		     hmdfs_dirty_freerun_ceiling(&hdtc, HMDFS_DIRTY_FS)) &&
		    (hdtc.file_nr_reclaimable <
		     hmdfs_dirty_freerun_ceiling(&hdtc, HMDFS_DIRTY_FILE))) {
			unsigned long fs_intv, file_intv;

			fs_intv = hmdfs_dirty_intv(hdtc.fs_nr_reclaimable,
						   hdtc.fs_thresh);
			file_intv = hmdfs_dirty_intv(hdtc.file_nr_reclaimable,
						     hdtc.file_thresh);
			diff = min(fs_intv, file_intv);
free_running:
			current->nr_dirtied_pause = diff;
			current->nr_dirtied = 0;
			break;
		}

		if (hdtc.fs_nr_reclaimable >=
		    hmdfs_dirty_freerun_ceiling(&hdtc, HMDFS_DIRTY_FS)) {
			if (unlikely(!writeback_in_progress(wb)))
				hmdfs_writeback_inodes_sb(sb);
		} else {
			hmdfs_writeback_inode(sb, inode);
		}

		/*
		 * If dirty_auto_threshold is enabled, recalculate the
		 * writeback thresh according to the current bandwidth.
		 * Updating the bandwidth here would be better, but
		 * wb_update_bandwidth() is not exported, so the bandwidth
		 * estimate may lag when a lot is written to a single file.
		 */
		if (hwb->dirty_auto_threshold &&
		    time_is_before_jiffies(hdtc.thresh_time_stamp +
					   HMDFS_BANDWIDTH_INTERVAL))
			hmdfs_update_dirty_limit(&hdtc);

		if (unlikely(hdtc.fs_nr_reclaimable >= hdtc.fs_thresh))
			exceed |= HMDFS_FS_EXCEED;
		if (unlikely(hdtc.file_nr_reclaimable >= hdtc.file_thresh))
			exceed |= HMDFS_FILE_EXCEED;

		if (!exceed) {
			trace_hmdfs_balance_dirty_pages(sbi, wb, &hdtc,
							0UL, start_time);
			current->nr_dirtied = 0;
			break;
		}
		/*
		 * Per-file or per-fs reclaimable pages exceed throttle limit,
		 * sleep pause time and check again.
		 */
		dirty_exceeded |= exceed;
		if (dirty_exceeded && !hwb->dirty_exceeded)
			hwb->dirty_exceeded = true;

		/* Pause */
		pause = hmdfs_wb_pause(wb, hdtc.fs_nr_reclaimable);

		trace_hmdfs_balance_dirty_pages(sbi, wb, &hdtc, pause,
						start_time);

		__set_current_state(TASK_KILLABLE);
		io_schedule_timeout(pause);

		if (fatal_signal_pending(current))
			break;
	}

	if (!dirty_exceeded && hwb->dirty_exceeded)
		hwb->dirty_exceeded = false;

	if (hdtc.fs_nr_reclaimable >= hdtc.fs_bg_thresh) {
		if (unlikely(!writeback_in_progress(wb)))
			hmdfs_writeback_inodes_sb(sb);
	} else if (hdtc.file_nr_reclaimable >= hdtc.file_bg_thresh) {
		hmdfs_writeback_inode(sb, inode);
	}
}

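/*
 * Ratelimited entry point for the write path: per-task and per-CPU
 * counters keep the relatively expensive hmdfs_balance_dirty_pages()
 * from being called on every dirtied page.
 */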
void hmdfs_balance_dirty_pages_ratelimited(struct address_space *mapping)
{
	struct hmdfs_sb_info *sbi = mapping->host->i_sb->s_fs_info;
	struct hmdfs_writeback *hwb = sbi->h_wb;
	int *bdp_ratelimits = NULL;
	int ratelimit;

	if (!hwb->dirty_writeback_control)
		return;

	/* Queue delayed work to trigger timeout writeback */
	if (hwb->dirty_writeback_interval != 0)
		hmdfs_writeback_inodes_sb_delayed(
			mapping->host->i_sb,
			hwb->dirty_writeback_interval * 10);

	ratelimit = current->nr_dirtied_pause;
	if (hwb->dirty_exceeded)
		ratelimit = min(ratelimit, HMDFS_DIRTY_EXCEED_RATELIMIT);

	/*
	 * This prevents one CPU from accumulating too many dirtied pages
	 * without calling into hmdfs_balance_dirty_pages(), which can
	 * happen when 1000+ tasks all start dirtying pages at exactly the
	 * same time and hence all honour a too-large initial
	 * task->nr_dirtied_pause.
	 */
	preempt_disable();
	bdp_ratelimits = this_cpu_ptr(hwb->bdp_ratelimits);

	trace_hmdfs_balance_dirty_pages_ratelimited(sbi, hwb, *bdp_ratelimits);

	if (unlikely(current->nr_dirtied >= ratelimit)) {
		*bdp_ratelimits = 0;
	} else if (unlikely(*bdp_ratelimits >= hwb->ratelimit_pages)) {
		*bdp_ratelimits = 0;
		ratelimit = 0;
	}
	preempt_enable();

	if (unlikely(current->nr_dirtied >= ratelimit))
		hmdfs_balance_dirty_pages(mapping);
}

void hmdfs_destroy_writeback(struct hmdfs_sb_info *sbi)
{
	if (!sbi->h_wb)
		return;

	flush_delayed_work(&sbi->h_wb->dirty_sb_writeback_work);
	flush_delayed_work(&sbi->h_wb->dirty_inode_writeback_work);
	destroy_workqueue(sbi->h_wb->dirty_sb_writeback_wq);
	destroy_workqueue(sbi->h_wb->dirty_inode_writeback_wq);
	free_percpu(sbi->h_wb->bdp_ratelimits);
	kfree(sbi->h_wb);
	sbi->h_wb = NULL;
}

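/*
 * Allocate and initialize the per-superblock writeback control: default
 * thresholds, the per-CPU ratelimit counters and the two single-threaded
 * workqueues used for delayed inode and superblock writeback.
 */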
int hmdfs_init_writeback(struct hmdfs_sb_info *sbi)
{
	struct hmdfs_writeback *hwb;
	char name[HMDFS_WQ_NAME_LEN];
	int ret = -ENOMEM;

	hwb = kzalloc(sizeof(struct hmdfs_writeback), GFP_KERNEL);
	if (!hwb)
		return ret;

	hwb->sbi = sbi;
	hwb->wb = &sbi->sb->s_bdi->wb;
	hwb->dirty_writeback_control = true;
	hwb->dirty_writeback_interval = HM_DEFAULT_WRITEBACK_INTERVAL;
	hwb->dirty_file_bg_bytes = HMDFS_FILE_BG_WB_BYTES;
	hwb->dirty_fs_bg_bytes = HMDFS_FS_BG_WB_BYTES;
	hwb->dirty_file_bytes = HMDFS_FILE_WB_BYTES;
	hwb->dirty_fs_bytes = HMDFS_FS_WB_BYTES;
	hmdfs_calculate_dirty_thresh(hwb);
	hwb->bw_file_thresh = hwb->dirty_file_thresh;
	hwb->bw_fs_thresh = hwb->dirty_fs_thresh;
	spin_lock_init(&hwb->inode_list_lock);
	INIT_LIST_HEAD(&hwb->inode_list_head);
	hwb->dirty_exceeded = false;
	hwb->ratelimit_pages = HMDFS_DEF_RATELIMIT_PAGES;
	hwb->dirty_auto_threshold = true;
	hwb->writeback_timelimit = HMDFS_DEF_WB_TIMELIMIT;
	hwb->bw_thresh_lowerlimit = HMDFS_BW_THRESH_DEF_LIMIT;
	spin_lock_init(&hwb->write_bandwidth_lock);
	hwb->avg_write_bandwidth = 0;
	hwb->max_write_bandwidth = 0;
	hwb->min_write_bandwidth = ULONG_MAX;
	hwb->bdp_ratelimits = alloc_percpu(int);
	if (!hwb->bdp_ratelimits)
		goto free_hwb;

	snprintf(name, sizeof(name), "dfs_ino_wb%u", sbi->seq);
	hwb->dirty_inode_writeback_wq = create_singlethread_workqueue(name);
	if (!hwb->dirty_inode_writeback_wq) {
		hmdfs_err("Failed to create inode writeback workqueue!");
		goto free_bdp;
	}
	snprintf(name, sizeof(name), "dfs_sb_wb%u", sbi->seq);
	hwb->dirty_sb_writeback_wq = create_singlethread_workqueue(name);
	if (!hwb->dirty_sb_writeback_wq) {
		hmdfs_err("Failed to create filesystem writeback workqueue!");
		goto free_i_wq;
	}
	INIT_DELAYED_WORK(&hwb->dirty_sb_writeback_work,
			  hmdfs_writeback_inodes_sb_handler);
	INIT_DELAYED_WORK(&hwb->dirty_inode_writeback_work,
			  hmdfs_writeback_inode_handler);
	sbi->h_wb = hwb;
	return 0;
free_i_wq:
	destroy_workqueue(hwb->dirty_inode_writeback_wq);
free_bdp:
	free_percpu(hwb->bdp_ratelimits);
free_hwb:
	kfree(hwb);
	return ret;
}