// SPDX-License-Identifier: GPL-2.0
/*
 * fs/hmdfs/client_writeback.c
 *
 * Copyright (c) 2020-2021 Huawei Device Co., Ltd.
 */

#include <linux/backing-dev.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/page-flags.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>

#include "hmdfs.h"
#include "hmdfs_trace.h"

/* 200ms */
#define HMDFS_MAX_PAUSE max((HZ / 5), 1)
#define HMDFS_BANDWIDTH_INTERVAL max((HZ / 5), 1)
/* Dirty type */
#define HMDFS_DIRTY_FS 0
#define HMDFS_DIRTY_FILE 1
/* Exceed flags */
#define HMDFS_FS_EXCEED (1 << HMDFS_DIRTY_FS)
#define HMDFS_FILE_EXCEED (1 << HMDFS_DIRTY_FILE)
/* Ratelimit calculation shift */
#define HMDFS_LIMIT_SHIFT 10

void hmdfs_writeback_inodes_sb_handler(struct work_struct *work)
{
	struct hmdfs_writeback *hwb = container_of(
		work, struct hmdfs_writeback, dirty_sb_writeback_work.work);

	try_to_writeback_inodes_sb(hwb->sbi->sb, WB_REASON_FS_FREE_SPACE);
}

void hmdfs_writeback_inode_handler(struct work_struct *work)
{
	struct hmdfs_inode_info *info = NULL;
	struct inode *inode = NULL;
	struct hmdfs_writeback *hwb = container_of(
		work, struct hmdfs_writeback, dirty_inode_writeback_work.work);

	spin_lock(&hwb->inode_list_lock);
	while (likely(!list_empty(&hwb->inode_list_head))) {
		info = list_first_entry(&hwb->inode_list_head,
					struct hmdfs_inode_info, wb_list);
		list_del_init(&info->wb_list);
		spin_unlock(&hwb->inode_list_lock);

		inode = &info->vfs_inode;
		write_inode_now(inode, 0);
		iput(inode);
		spin_lock(&hwb->inode_list_lock);
	}
	spin_unlock(&hwb->inode_list_lock);
}

static void hmdfs_writeback_inodes_sb_delayed(struct super_block *sb,
					      unsigned int delay)
{
	struct hmdfs_sb_info *sbi = sb->s_fs_info;
	unsigned long timeout;

	timeout = msecs_to_jiffies(delay);
	if (!timeout || !work_busy(&sbi->h_wb->dirty_sb_writeback_work.work))
		mod_delayed_work(sbi->h_wb->dirty_sb_writeback_wq,
				 &sbi->h_wb->dirty_sb_writeback_work, timeout);
}

static inline void hmdfs_writeback_inodes_sb(struct super_block *sb)
{
	hmdfs_writeback_inodes_sb_delayed(sb, 0);
}

static void hmdfs_writeback_inode(struct super_block *sb, struct inode *inode)
{
	struct hmdfs_sb_info *sbi = sb->s_fs_info;
	struct hmdfs_writeback *hwb = sbi->h_wb;
	struct hmdfs_inode_info *info = hmdfs_i(inode);

	spin_lock(&hwb->inode_list_lock);
	if (list_empty(&info->wb_list)) {
		ihold(inode);
		list_add_tail(&info->wb_list, &hwb->inode_list_head);
		queue_delayed_work(hwb->dirty_inode_writeback_wq,
				   &hwb->dirty_inode_writeback_work, 0);
	}
	spin_unlock(&hwb->inode_list_lock);
}

static unsigned long hmdfs_idirty_pages(struct inode *inode, int tag)
{
	struct pagevec pvec;
	unsigned long nr_dirty_pages = 0;
	pgoff_t index = 0;

#if KERNEL_VERSION(4, 15, 0) <= LINUX_VERSION_CODE
	pagevec_init(&pvec);
#else
	pagevec_init(&pvec, 0);
#endif
	while (pagevec_lookup_tag(&pvec, inode->i_mapping, &index, tag)) {
		nr_dirty_pages += pagevec_count(&pvec);
		pagevec_release(&pvec);
		cond_resched();
	}
	return nr_dirty_pages;
}

static inline unsigned long hmdfs_ratio_thresh(unsigned long ratio,
					       unsigned long thresh)
{
	unsigned long ret = (ratio * thresh) >> HMDFS_LIMIT_SHIFT;

	return (ret == 0) ? 1 : ret;
}
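
/*
 * Illustration (hypothetical numbers): with HMDFS_LIMIT_SHIFT = 10 the helper
 * above and the one below form a simple 10-bit fixed-point pair.
 * hmdfs_thresh_ratio(512, 2048) = (512 << 10) / 2048 = 256 encodes the 1:4
 * proportion between two thresholds, and
 * hmdfs_ratio_thresh(256, 4096) = (256 * 4096) >> 10 = 1024 reapplies that
 * proportion to a different threshold.
 */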

static inline unsigned long hmdfs_thresh_ratio(unsigned long base,
					       unsigned long thresh)
{
	unsigned long ratio = (base << HMDFS_LIMIT_SHIFT) / thresh;

	return (ratio == 0) ? 1 : ratio;
}

void hmdfs_calculate_dirty_thresh(struct hmdfs_writeback *hwb)
{
	hwb->dirty_fs_thresh = DIV_ROUND_UP(hwb->dirty_fs_bytes, PAGE_SIZE);
	hwb->dirty_file_thresh = DIV_ROUND_UP(hwb->dirty_file_bytes, PAGE_SIZE);
	hwb->dirty_fs_bg_thresh =
		DIV_ROUND_UP(hwb->dirty_fs_bg_bytes, PAGE_SIZE);
	hwb->dirty_file_bg_thresh =
		DIV_ROUND_UP(hwb->dirty_file_bg_bytes, PAGE_SIZE);

	hwb->fs_bg_ratio = hmdfs_thresh_ratio(hwb->dirty_fs_bg_thresh,
					      hwb->dirty_fs_thresh);
	hwb->file_bg_ratio = hmdfs_thresh_ratio(hwb->dirty_file_bg_thresh,
						hwb->dirty_file_thresh);
	hwb->fs_file_ratio = hmdfs_thresh_ratio(hwb->dirty_file_thresh,
						hwb->dirty_fs_thresh);
}

static void hmdfs_init_dirty_limit(struct hmdfs_dirty_throttle_control *hdtc)
{
	struct hmdfs_writeback *hwb = hdtc->hwb;

	hdtc->fs_thresh = hdtc->hwb->dirty_fs_thresh;
	hdtc->file_thresh = hdtc->hwb->dirty_file_thresh;
	hdtc->fs_bg_thresh = hdtc->hwb->dirty_fs_bg_thresh;
	hdtc->file_bg_thresh = hdtc->hwb->dirty_file_bg_thresh;

	if (!hwb->dirty_auto_threshold)
		return;

	/*
	 * Initialize the thresholds from the previous bandwidth-adjusted
	 * thresholds; a threshold must not exceed its configured value.
	 */
	if (hwb->bw_fs_thresh < hdtc->fs_thresh) {
		hdtc->fs_thresh = hwb->bw_fs_thresh;
		hdtc->fs_bg_thresh = hmdfs_ratio_thresh(hwb->fs_bg_ratio,
							hdtc->fs_thresh);
	}
	if (hwb->bw_file_thresh < hdtc->file_thresh) {
		hdtc->file_thresh = hwb->bw_file_thresh;
		hdtc->file_bg_thresh = hmdfs_ratio_thresh(hwb->file_bg_ratio,
							  hdtc->file_thresh);
	}
	/*
	 * Make sure the thresholds are updated the first time the dirty
	 * pages exceed the freerun ceiling.
	 */
	hdtc->thresh_time_stamp = jiffies - HMDFS_BANDWIDTH_INTERVAL - 1;
}

static void hmdfs_update_dirty_limit(struct hmdfs_dirty_throttle_control *hdtc)
{
	struct hmdfs_writeback *hwb = hdtc->hwb;
	struct bdi_writeback *wb = hwb->wb;
	unsigned int time_limit = hwb->writeback_timelimit;
	unsigned long bw = wb->avg_write_bandwidth;
	unsigned long thresh;

	if (!hwb->dirty_auto_threshold)
		return;

	spin_lock(&hwb->write_bandwidth_lock);
	if (bw > hwb->max_write_bandwidth)
		hwb->max_write_bandwidth = bw;

	if (bw < hwb->min_write_bandwidth)
		hwb->min_write_bandwidth = bw;
	hwb->avg_write_bandwidth = bw;
	spin_unlock(&hwb->write_bandwidth_lock);

	/*
	 * If the bandwidth is lower than the lower limit, the remote end is
	 * probably offline; it is meaningless to set such a low thresh.
	 */
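	/*
	 * Sketch of the computation below (assuming avg_write_bandwidth is
	 * tracked in pages per second, as in the generic writeback code):
	 * thresh approximates the number of pages the measured bandwidth can
	 * write back within the writeback time limit. For example, with
	 * HZ = 250, bw = 25600 pages/s and a time limit of 5 * HZ jiffies,
	 * thresh = 25600 * 1250 / 256 = 125000 pages (~490 MB of 4 KB pages).
	 */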
	bw = max(bw, hwb->bw_thresh_lowerlimit);
	thresh = bw * time_limit / roundup_pow_of_two(HZ);
	if (thresh >= hwb->dirty_fs_thresh) {
		hdtc->fs_thresh = hwb->dirty_fs_thresh;
		hdtc->file_thresh = hwb->dirty_file_thresh;
		hdtc->fs_bg_thresh = hwb->dirty_fs_bg_thresh;
		hdtc->file_bg_thresh = hwb->dirty_file_bg_thresh;
	} else {
		/* Adjust thresh according to the current bandwidth */
		hdtc->fs_thresh = thresh;
		hdtc->fs_bg_thresh = hmdfs_ratio_thresh(hwb->fs_bg_ratio,
							hdtc->fs_thresh);
		hdtc->file_thresh = hmdfs_ratio_thresh(hwb->fs_file_ratio,
						       hdtc->fs_thresh);
		hdtc->file_bg_thresh = hmdfs_ratio_thresh(hwb->file_bg_ratio,
							  hdtc->file_thresh);
	}
	/* Save the bandwidth-adjusted thresh */
	hwb->bw_fs_thresh = hdtc->fs_thresh;
	hwb->bw_file_thresh = hdtc->file_thresh;
	/* Update the time stamp */
	hdtc->thresh_time_stamp = jiffies;
}

void hmdfs_update_ratelimit(struct hmdfs_writeback *hwb)
{
	struct hmdfs_dirty_throttle_control hdtc = {.hwb = hwb};

	hmdfs_init_dirty_limit(&hdtc);

	/* hdtc.file_bg_thresh should be the lowest thresh */
	hwb->ratelimit_pages = hdtc.file_bg_thresh /
			       (num_online_cpus() * HMDFS_RATELIMIT_PAGES_GAP);
	if (hwb->ratelimit_pages < HMDFS_MIN_RATELIMIT_PAGES)
		hwb->ratelimit_pages = HMDFS_MIN_RATELIMIT_PAGES;
}

/* This is a copy of wb_max_pause() */
static unsigned long hmdfs_wb_pause(struct bdi_writeback *wb,
				    unsigned long wb_dirty)
{
	unsigned long bw = wb->avg_write_bandwidth;
	unsigned long t;

	/*
	 * Limit the pause time for small memory systems. If sleeping for too
	 * long, a small pool of dirty/writeback pages may go empty and the
	 * disk go idle.
	 *
	 * 8 serves as the safety ratio.
	 */
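	/*
	 * Example (hypothetical numbers): with HZ = 250 the divisor
	 * roundup_pow_of_two(1 + HZ / 8) is 32, so for bw = 25600 pages/s and
	 * wb_dirty = 2048 pages, t = 2048 / (1 + 25600 / 32) = 2 jiffies,
	 * roughly 1/8 of the time needed to write those pages back. The t++
	 * below keeps the pause at least one jiffy and HMDFS_MAX_PAUSE caps
	 * it at HZ / 5.
	 */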
	t = wb_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
	t++;

	return min_t(unsigned long, t, HMDFS_MAX_PAUSE);
}

static unsigned long
hmdfs_dirty_freerun_ceiling(struct hmdfs_dirty_throttle_control *hdtc,
			    unsigned int type)
{
	if (type == HMDFS_DIRTY_FS)
		return (hdtc->fs_thresh + hdtc->fs_bg_thresh) / 2;
	else /* HMDFS_DIRTY_FILE */
		return (hdtc->file_thresh + hdtc->file_bg_thresh) / 2;
}

/* This is a copy of dirty_poll_interval() */
static inline unsigned long hmdfs_dirty_intv(unsigned long dirty,
					     unsigned long thresh)
{
	if (thresh > dirty)
		return 1UL << (ilog2(thresh - dirty) >> 1);
	return 1;
}

static void hmdfs_balance_dirty_pages(struct address_space *mapping)
{
	struct inode *inode = mapping->host;
	struct super_block *sb = inode->i_sb;
	struct hmdfs_sb_info *sbi = sb->s_fs_info;
	struct hmdfs_writeback *hwb = sbi->h_wb;
	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
	struct hmdfs_dirty_throttle_control hdtc = {.hwb = hwb};
	unsigned int dirty_exceeded = 0;
	unsigned long start_time = jiffies;
	unsigned long pause = 0;

	/* Queue delayed work to trigger timeout writeback */
	if (hwb->dirty_writeback_interval != 0)
		hmdfs_writeback_inodes_sb_delayed(
			sb, hwb->dirty_writeback_interval * 10);

	hmdfs_init_dirty_limit(&hdtc);

	while (1) {
		unsigned long exceed = 0;
		unsigned long diff;

		/* Per-filesystem overbalance writeback */
		hdtc.fs_nr_dirty = wb_stat_sum(wb, WB_RECLAIMABLE);
		hdtc.fs_nr_reclaimable =
			hdtc.fs_nr_dirty + wb_stat_sum(wb, WB_WRITEBACK);
		if (hdtc.fs_nr_reclaimable < hdtc.file_bg_thresh) {
			diff = hmdfs_dirty_intv(hdtc.fs_nr_reclaimable,
						hdtc.file_thresh);
			goto free_running;
		}

		/* Per-file overbalance writeback */
		hdtc.file_nr_dirty =
			hmdfs_idirty_pages(inode, PAGECACHE_TAG_DIRTY);
		hdtc.file_nr_reclaimable =
			hmdfs_idirty_pages(inode, PAGECACHE_TAG_WRITEBACK) +
			hdtc.file_nr_dirty;
		if ((hdtc.fs_nr_reclaimable <
		     hmdfs_dirty_freerun_ceiling(&hdtc, HMDFS_DIRTY_FS)) &&
		    (hdtc.file_nr_reclaimable <
		     hmdfs_dirty_freerun_ceiling(&hdtc, HMDFS_DIRTY_FILE))) {
			unsigned long fs_intv, file_intv;

			fs_intv = hmdfs_dirty_intv(hdtc.fs_nr_reclaimable,
						   hdtc.fs_thresh);
			file_intv = hmdfs_dirty_intv(hdtc.file_nr_reclaimable,
						     hdtc.file_thresh);
			diff = min(fs_intv, file_intv);
free_running:
			current->nr_dirtied_pause = diff;
			current->nr_dirtied = 0;
			break;
		}

		if (hdtc.fs_nr_reclaimable >=
		    hmdfs_dirty_freerun_ceiling(&hdtc, HMDFS_DIRTY_FS)) {
			if (unlikely(!writeback_in_progress(wb)))
				hmdfs_writeback_inodes_sb(sb);
		} else {
			hmdfs_writeback_inode(sb, inode);
		}

		/*
		 * If dirty_auto_threshold is enabled, recalculate the
		 * writeback thresh according to the current bandwidth.
		 * Updating the bandwidth here would be better, but
		 * wb_update_bandwidth() is not exported, so we cannot do
		 * that; the bandwidth update is therefore delayed when a lot
		 * is written to a single file.
		 */
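		/*
		 * Note: thresh_time_stamp is initialized so that this check
		 * already passes on the first trip through the loop; after
		 * that the thresholds are refreshed at most once per
		 * HMDFS_BANDWIDTH_INTERVAL (about 200 ms).
		 */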
		if (hwb->dirty_auto_threshold &&
		    time_is_before_jiffies(hdtc.thresh_time_stamp +
					   HMDFS_BANDWIDTH_INTERVAL))
			hmdfs_update_dirty_limit(&hdtc);

		if (unlikely(hdtc.fs_nr_reclaimable >= hdtc.fs_thresh))
			exceed |= HMDFS_FS_EXCEED;
		if (unlikely(hdtc.file_nr_reclaimable >= hdtc.file_thresh))
			exceed |= HMDFS_FILE_EXCEED;

		if (!exceed) {
			trace_hmdfs_balance_dirty_pages(sbi, wb, &hdtc,
							0UL, start_time);
			current->nr_dirtied = 0;
			break;
		}
		/*
		 * Per-file or per-fs reclaimable pages exceed the throttle
		 * limit; sleep for the pause time and check again.
		 */
		dirty_exceeded |= exceed;
		if (dirty_exceeded && !hwb->dirty_exceeded)
			hwb->dirty_exceeded = true;

		/* Pause */
		pause = hmdfs_wb_pause(wb, hdtc.fs_nr_reclaimable);

		trace_hmdfs_balance_dirty_pages(sbi, wb, &hdtc, pause,
						start_time);

		__set_current_state(TASK_KILLABLE);
		io_schedule_timeout(pause);

		if (fatal_signal_pending(current))
			break;
	}

	if (!dirty_exceeded && hwb->dirty_exceeded)
		hwb->dirty_exceeded = false;

	if (hdtc.fs_nr_reclaimable >= hdtc.fs_bg_thresh) {
		if (unlikely(!writeback_in_progress(wb)))
			hmdfs_writeback_inodes_sb(sb);
	} else if (hdtc.file_nr_reclaimable >= hdtc.file_bg_thresh) {
		hmdfs_writeback_inode(sb, inode);
	}
}

void hmdfs_balance_dirty_pages_ratelimited(struct address_space *mapping)
{
	struct hmdfs_sb_info *sbi = mapping->host->i_sb->s_fs_info;
	struct hmdfs_writeback *hwb = sbi->h_wb;
	int *bdp_ratelimits = NULL;
	int ratelimit;

	if (!hwb->dirty_writeback_control)
		return;

	/* Queue delayed work to trigger timeout writeback */
	if (hwb->dirty_writeback_interval != 0)
		hmdfs_writeback_inodes_sb_delayed(
			mapping->host->i_sb,
			hwb->dirty_writeback_interval * 10);

	ratelimit = current->nr_dirtied_pause;
	if (hwb->dirty_exceeded)
		ratelimit = min(ratelimit, HMDFS_DIRTY_EXCEED_RATELIMIT);

	/*
	 * This prevents one CPU from accumulating too many dirtied pages
	 * without calling into hmdfs_balance_dirty_pages(), which can happen
	 * when there are 1000+ tasks that all start dirtying pages at exactly
	 * the same time and hence all honour a too-large initial
	 * task->nr_dirtied_pause.
	 */
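	/*
	 * Example (hypothetical numbers, assuming the per-CPU counter is
	 * incremented for every page dirtied, as in the generic
	 * balance_dirty_pages_ratelimited() logic): with ratelimit_pages = 64,
	 * once the tasks on one CPU have dirtied 64 pages since the last
	 * reset, ratelimit is forced to 0 below and the next dirtier on that
	 * CPU drops into hmdfs_balance_dirty_pages() even though its own
	 * nr_dirtied is still under nr_dirtied_pause.
	 */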
	preempt_disable();
	bdp_ratelimits = this_cpu_ptr(hwb->bdp_ratelimits);

	trace_hmdfs_balance_dirty_pages_ratelimited(sbi, hwb, *bdp_ratelimits);

	if (unlikely(current->nr_dirtied >= ratelimit)) {
		*bdp_ratelimits = 0;
	} else if (unlikely(*bdp_ratelimits >= hwb->ratelimit_pages)) {
		*bdp_ratelimits = 0;
		ratelimit = 0;
	}
	preempt_enable();

	if (unlikely(current->nr_dirtied >= ratelimit))
		hmdfs_balance_dirty_pages(mapping);
}

void hmdfs_destroy_writeback(struct hmdfs_sb_info *sbi)
{
	if (!sbi->h_wb)
		return;

	flush_delayed_work(&sbi->h_wb->dirty_sb_writeback_work);
	flush_delayed_work(&sbi->h_wb->dirty_inode_writeback_work);
	destroy_workqueue(sbi->h_wb->dirty_sb_writeback_wq);
	destroy_workqueue(sbi->h_wb->dirty_inode_writeback_wq);
	free_percpu(sbi->h_wb->bdp_ratelimits);
	kfree(sbi->h_wb);
	sbi->h_wb = NULL;
}

int hmdfs_init_writeback(struct hmdfs_sb_info *sbi)
{
	struct hmdfs_writeback *hwb;
	char name[HMDFS_WQ_NAME_LEN];
	int ret = -ENOMEM;

	hwb = kzalloc(sizeof(struct hmdfs_writeback), GFP_KERNEL);
	if (!hwb)
		return ret;

	hwb->sbi = sbi;
	hwb->wb = &sbi->sb->s_bdi->wb;
	hwb->dirty_writeback_control = true;
	hwb->dirty_writeback_interval = HM_DEFAULT_WRITEBACK_INTERVAL;
	hwb->dirty_file_bg_bytes = HMDFS_FILE_BG_WB_BYTES;
	hwb->dirty_fs_bg_bytes = HMDFS_FS_BG_WB_BYTES;
	hwb->dirty_file_bytes = HMDFS_FILE_WB_BYTES;
	hwb->dirty_fs_bytes = HMDFS_FS_WB_BYTES;
	hmdfs_calculate_dirty_thresh(hwb);
	hwb->bw_file_thresh = hwb->dirty_file_thresh;
	hwb->bw_fs_thresh = hwb->dirty_fs_thresh;
	spin_lock_init(&hwb->inode_list_lock);
	INIT_LIST_HEAD(&hwb->inode_list_head);
	hwb->dirty_exceeded = false;
	hwb->ratelimit_pages = HMDFS_DEF_RATELIMIT_PAGES;
	hwb->dirty_auto_threshold = true;
	hwb->writeback_timelimit = HMDFS_DEF_WB_TIMELIMIT;
	hwb->bw_thresh_lowerlimit = HMDFS_BW_THRESH_DEF_LIMIT;
	spin_lock_init(&hwb->write_bandwidth_lock);
	hwb->avg_write_bandwidth = 0;
	hwb->max_write_bandwidth = 0;
	hwb->min_write_bandwidth = ULONG_MAX;
	hwb->bdp_ratelimits = alloc_percpu(int);
	if (!hwb->bdp_ratelimits)
		goto free_hwb;

	snprintf(name, sizeof(name), "dfs_ino_wb%u", sbi->seq);
	hwb->dirty_inode_writeback_wq = create_singlethread_workqueue(name);
	if (!hwb->dirty_inode_writeback_wq) {
		hmdfs_err("Failed to create inode writeback workqueue!");
		goto free_bdp;
	}
	snprintf(name, sizeof(name), "dfs_sb_wb%u", sbi->seq);
	hwb->dirty_sb_writeback_wq = create_singlethread_workqueue(name);
	if (!hwb->dirty_sb_writeback_wq) {
		hmdfs_err("Failed to create filesystem writeback workqueue!");
		goto free_i_wq;
	}
	INIT_DELAYED_WORK(&hwb->dirty_sb_writeback_work,
			  hmdfs_writeback_inodes_sb_handler);
	INIT_DELAYED_WORK(&hwb->dirty_inode_writeback_work,
			  hmdfs_writeback_inode_handler);
	sbi->h_wb = hwb;
	return 0;
free_i_wq:
	destroy_workqueue(hwb->dirty_inode_writeback_wq);
free_bdp:
	free_percpu(hwb->bdp_ratelimits);
free_hwb:
	kfree(hwb);
	return ret;
}