// SPDX-License-Identifier: GPL-2.0
/*
 * fs/hmdfs/client_writeback.c
 *
 * Copyright (c) 2020-2021 Huawei Device Co., Ltd.
 */

#include <linux/backing-dev.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/page-flags.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>

#include "hmdfs.h"
#include "hmdfs_trace.h"

/* 200ms */
#define HMDFS_MAX_PAUSE max((HZ / 5), 1)
#define HMDFS_BANDWIDTH_INTERVAL max((HZ / 5), 1)
/* Dirty type */
#define HMDFS_DIRTY_FS 0
#define HMDFS_DIRTY_FILE 1
/* Exceed flags */
#define HMDFS_FS_EXCEED (1 << HMDFS_DIRTY_FS)
#define HMDFS_FILE_EXCEED (1 << HMDFS_DIRTY_FILE)
/* Ratelimit calculation shift */
#define HMDFS_LIMIT_SHIFT 10

void hmdfs_writeback_inodes_sb_handler(struct work_struct *work)
{
	struct hmdfs_writeback *hwb = container_of(
		work, struct hmdfs_writeback, dirty_sb_writeback_work.work);

	try_to_writeback_inodes_sb(hwb->sbi->sb, WB_REASON_FS_FREE_SPACE);
}

void hmdfs_writeback_inode_handler(struct work_struct *work)
{
	struct hmdfs_inode_info *info = NULL;
	struct inode *inode = NULL;
	struct hmdfs_writeback *hwb = container_of(
		work, struct hmdfs_writeback, dirty_inode_writeback_work.work);

	spin_lock(&hwb->inode_list_lock);
	while (likely(!list_empty(&hwb->inode_list_head))) {
		info = list_first_entry(&hwb->inode_list_head,
					struct hmdfs_inode_info, wb_list);
		list_del_init(&info->wb_list);
		spin_unlock(&hwb->inode_list_lock);

		inode = &info->vfs_inode;
		write_inode_now(inode, 0);
		iput(inode);
		spin_lock(&hwb->inode_list_lock);
	}
	spin_unlock(&hwb->inode_list_lock);
}

static void hmdfs_writeback_inodes_sb_delayed(struct super_block *sb,
					      unsigned int delay)
{
	struct hmdfs_sb_info *sbi = sb->s_fs_info;
	unsigned long timeout;

	timeout = msecs_to_jiffies(delay);
	if (!timeout || !work_busy(&sbi->h_wb->dirty_sb_writeback_work.work))
		mod_delayed_work(sbi->h_wb->dirty_sb_writeback_wq,
				 &sbi->h_wb->dirty_sb_writeback_work, timeout);
}

static inline void hmdfs_writeback_inodes_sb(struct super_block *sb)
{
	hmdfs_writeback_inodes_sb_delayed(sb, 0);
}

static void hmdfs_writeback_inode(struct super_block *sb, struct inode *inode)
{
	struct hmdfs_sb_info *sbi = sb->s_fs_info;
	struct hmdfs_writeback *hwb = sbi->h_wb;
	struct hmdfs_inode_info *info = hmdfs_i(inode);

	spin_lock(&hwb->inode_list_lock);
	if (list_empty(&info->wb_list)) {
		ihold(inode);
		list_add_tail(&info->wb_list, &hwb->inode_list_head);
		queue_delayed_work(hwb->dirty_inode_writeback_wq,
				   &hwb->dirty_inode_writeback_work, 0);
	}
	spin_unlock(&hwb->inode_list_lock);
}

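/*
 * Count the pages of @inode currently carrying the given page cache tag
 * (PAGECACHE_TAG_DIRTY or PAGECACHE_TAG_WRITEBACK) that are still dirty
 * or under writeback. The result is only an instantaneous estimate used
 * for per-file throttling decisions; pages may be redirtied or cleaned
 * concurrently while we walk the mapping.
 */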
static unsigned long hmdfs_idirty_pages(struct inode *inode, int tag)
{
#if KERNEL_VERSION(6, 3, 0) <= LINUX_VERSION_CODE
	struct folio_batch fbatch;
#else
	struct pagevec pvec;
#endif
	unsigned long nr_dirty_pages = 0;
	pgoff_t index = 0;

#if KERNEL_VERSION(4, 15, 0) <= LINUX_VERSION_CODE
#if KERNEL_VERSION(6, 3, 0) <= LINUX_VERSION_CODE
	folio_batch_init(&fbatch);
#else
	pagevec_init(&pvec);
#endif
#else
	pagevec_init(&pvec, 0);
#endif

#if KERNEL_VERSION(6, 3, 0) <= LINUX_VERSION_CODE
	while (filemap_get_folios_tag(inode->i_mapping, &index,
				      (pgoff_t)-1, tag, &fbatch)) {
		for (int i = 0; i < fbatch.nr; i++) {
			struct folio *folio = fbatch.folios[i];

			if (folio_test_dirty(folio) ||
			    folio_test_writeback(folio))
				nr_dirty_pages++;
		}
		folio_batch_release(&fbatch);
		cond_resched();
	}
#else
	while (pagevec_lookup_tag(&pvec, inode->i_mapping, &index, tag)) {
		nr_dirty_pages += pagevec_count(&pvec);
		pagevec_release(&pvec);
		cond_resched();
	}
#endif

	return nr_dirty_pages;
}

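/*
 * Fixed-point helpers used to scale thresholds: hmdfs_thresh_ratio()
 * stores base/thresh as a ratio with HMDFS_LIMIT_SHIFT fractional bits,
 * and hmdfs_ratio_thresh() applies such a ratio to another threshold.
 * For example (illustrative numbers, not defaults): with base = 12800
 * pages and thresh = 25600 pages the ratio is (12800 << 10) / 25600 = 512,
 * and applying it to a new thresh of 10000 pages yields
 * (512 * 10000) >> 10 = 5000, preserving the 1:2 proportion. Both helpers
 * return at least 1 so that derived thresholds and ratios never collapse
 * to zero.
 */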
static inline unsigned long hmdfs_ratio_thresh(unsigned long ratio,
					       unsigned long thresh)
{
	unsigned long ret = (ratio * thresh) >> HMDFS_LIMIT_SHIFT;

	return (ret == 0) ? 1 : ret;
}

static inline unsigned long hmdfs_thresh_ratio(unsigned long base,
					       unsigned long thresh)
{
	unsigned long ratio = (base << HMDFS_LIMIT_SHIFT) / thresh;

	return (ratio == 0) ? 1 : ratio;
}

void hmdfs_calculate_dirty_thresh(struct hmdfs_writeback *hwb)
{
	hwb->dirty_fs_thresh = DIV_ROUND_UP(hwb->dirty_fs_bytes, PAGE_SIZE);
	hwb->dirty_file_thresh = DIV_ROUND_UP(hwb->dirty_file_bytes, PAGE_SIZE);
	hwb->dirty_fs_bg_thresh =
		DIV_ROUND_UP(hwb->dirty_fs_bg_bytes, PAGE_SIZE);
	hwb->dirty_file_bg_thresh =
		DIV_ROUND_UP(hwb->dirty_file_bg_bytes, PAGE_SIZE);

	hwb->fs_bg_ratio = hmdfs_thresh_ratio(hwb->dirty_fs_bg_thresh,
					      hwb->dirty_fs_thresh);
	hwb->file_bg_ratio = hmdfs_thresh_ratio(hwb->dirty_file_bg_thresh,
						hwb->dirty_file_thresh);
	hwb->fs_file_ratio = hmdfs_thresh_ratio(hwb->dirty_file_thresh,
						hwb->dirty_fs_thresh);
}

static void hmdfs_init_dirty_limit(struct hmdfs_dirty_throttle_control *hdtc)
{
	struct hmdfs_writeback *hwb = hdtc->hwb;

	hdtc->fs_thresh = hdtc->hwb->dirty_fs_thresh;
	hdtc->file_thresh = hdtc->hwb->dirty_file_thresh;
	hdtc->fs_bg_thresh = hdtc->hwb->dirty_fs_bg_thresh;
	hdtc->file_bg_thresh = hdtc->hwb->dirty_file_bg_thresh;

	if (!hwb->dirty_auto_threshold)
		return;

	/*
	 * Initialize the thresh from the previous bandwidth-adjusted thresh;
	 * it must not exceed the configured thresh.
	 */
	if (hwb->bw_fs_thresh < hdtc->fs_thresh) {
		hdtc->fs_thresh = hwb->bw_fs_thresh;
		hdtc->fs_bg_thresh = hmdfs_ratio_thresh(hwb->fs_bg_ratio,
							hdtc->fs_thresh);
	}
	if (hwb->bw_file_thresh < hdtc->file_thresh) {
		hdtc->file_thresh = hwb->bw_file_thresh;
		hdtc->file_bg_thresh = hmdfs_ratio_thresh(hwb->file_bg_ratio,
							  hdtc->file_thresh);
	}
	/*
	 * The thresh should be updated the first time dirty pages exceed
	 * the freerun ceiling.
	 */
	hdtc->thresh_time_stamp = jiffies - HMDFS_BANDWIDTH_INTERVAL - 1;
}

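/*
 * Rescale the throttle thresholds from the bdi's average write bandwidth
 * when dirty_auto_threshold is enabled. The per-fs thresh is limited to
 * roughly bw * writeback_timelimit / HZ, i.e. the amount of data that can
 * be written back within the configured time limit; the bandwidth used is
 * clamped below by bw_thresh_lowerlimit, and the result never exceeds the
 * configured dirty_fs_thresh. The per-file and background thresholds are
 * then derived from the cached fixed-point ratios, the bandwidth-adjusted
 * values are saved for the next hmdfs_init_dirty_limit(), and the
 * min/max/avg bandwidth statistics are updated under write_bandwidth_lock.
 */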
static void hmdfs_update_dirty_limit(struct hmdfs_dirty_throttle_control *hdtc)
{
	struct hmdfs_writeback *hwb = hdtc->hwb;
	struct bdi_writeback *wb = hwb->wb;
	unsigned int time_limit = hwb->writeback_timelimit;
	unsigned long bw = wb->avg_write_bandwidth;
	unsigned long thresh;

	if (!hwb->dirty_auto_threshold)
		return;

	spin_lock(&hwb->write_bandwidth_lock);
	if (bw > hwb->max_write_bandwidth)
		hwb->max_write_bandwidth = bw;

	if (bw < hwb->min_write_bandwidth)
		hwb->min_write_bandwidth = bw;
	hwb->avg_write_bandwidth = bw;
	spin_unlock(&hwb->write_bandwidth_lock);

	/*
	 * If the bandwidth is lower than the lower limit, the peer is
	 * probably offline; it is meaningless to set such a low thresh.
	 */
	bw = max(bw, hwb->bw_thresh_lowerlimit);
	thresh = bw * time_limit / roundup_pow_of_two(HZ);
	if (thresh >= hwb->dirty_fs_thresh) {
		hdtc->fs_thresh = hwb->dirty_fs_thresh;
		hdtc->file_thresh = hwb->dirty_file_thresh;
		hdtc->fs_bg_thresh = hwb->dirty_fs_bg_thresh;
		hdtc->file_bg_thresh = hwb->dirty_file_bg_thresh;
	} else {
		/* Adjust thresh according to current bandwidth */
		hdtc->fs_thresh = thresh;
		hdtc->fs_bg_thresh = hmdfs_ratio_thresh(hwb->fs_bg_ratio,
							hdtc->fs_thresh);
		hdtc->file_thresh = hmdfs_ratio_thresh(hwb->fs_file_ratio,
						       hdtc->fs_thresh);
		hdtc->file_bg_thresh = hmdfs_ratio_thresh(hwb->file_bg_ratio,
							  hdtc->file_thresh);
	}
	/* Save bandwidth adjusted thresh */
	hwb->bw_fs_thresh = hdtc->fs_thresh;
	hwb->bw_file_thresh = hdtc->file_thresh;
	/* Update time stamp */
	hdtc->thresh_time_stamp = jiffies;
}

void hmdfs_update_ratelimit(struct hmdfs_writeback *hwb)
{
	struct hmdfs_dirty_throttle_control hdtc = {.hwb = hwb};

	hmdfs_init_dirty_limit(&hdtc);

	/* hdtc.file_bg_thresh should be the lowest thresh */
	hwb->ratelimit_pages = hdtc.file_bg_thresh /
			       (num_online_cpus() * HMDFS_RATELIMIT_PAGES_GAP);
	if (hwb->ratelimit_pages < HMDFS_MIN_RATELIMIT_PAGES)
		hwb->ratelimit_pages = HMDFS_MIN_RATELIMIT_PAGES;
}

/* This is a copy of wb_max_pause() */
static unsigned long hmdfs_wb_pause(struct bdi_writeback *wb,
				    unsigned long wb_dirty)
{
	unsigned long bw = wb->avg_write_bandwidth;
	unsigned long t;

	/*
	 * Limit pause time for small memory systems. If sleeping for too long
	 * time, a small pool of dirty/writeback pages may go empty and disk go
	 * idle.
	 *
	 * 8 serves as the safety ratio.
	 */
	t = wb_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
	t++;

	return min_t(unsigned long, t, HMDFS_MAX_PAUSE);
}

static unsigned long
hmdfs_dirty_freerun_ceiling(struct hmdfs_dirty_throttle_control *hdtc,
			    unsigned int type)
{
	if (type == HMDFS_DIRTY_FS)
		return (hdtc->fs_thresh + hdtc->fs_bg_thresh) / 2;
	else /* HMDFS_DIRTY_FILE */
		return (hdtc->file_thresh + hdtc->file_bg_thresh) / 2;
}

/* This is a copy of dirty_poll_interval() */
static inline unsigned long hmdfs_dirty_intv(unsigned long dirty,
					     unsigned long thresh)
{
	if (thresh > dirty)
		return 1UL << (ilog2(thresh - dirty) >> 1);
	return 1;
}

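/*
 * Throttle the current task based on the number of dirty and writeback
 * pages, both for the whole hmdfs super block (via the bdi
 * WB_RECLAIMABLE/WB_WRITEBACK counters) and for the single file being
 * written. Below the freerun ceilings the task only receives a new
 * nr_dirtied_pause budget; above them background writeback is kicked
 * (sb-wide or per-inode), and once either thresh is exceeded the task
 * sleeps in TASK_KILLABLE state for hmdfs_wb_pause() jiffies per
 * iteration until the dirty counts drop back under the thresholds or a
 * fatal signal is pending.
 */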
static void hmdfs_balance_dirty_pages(struct address_space *mapping)
{
	struct inode *inode = mapping->host;
	struct super_block *sb = inode->i_sb;
	struct hmdfs_sb_info *sbi = sb->s_fs_info;
	struct hmdfs_writeback *hwb = sbi->h_wb;
	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
	struct hmdfs_dirty_throttle_control hdtc = {.hwb = hwb};
	unsigned int dirty_exceeded = 0;
	unsigned long start_time = jiffies;
	unsigned long pause = 0;

	/* Add delayed work to trigger timeout writeback */
	if (hwb->dirty_writeback_interval != 0)
		hmdfs_writeback_inodes_sb_delayed(
			sb, hwb->dirty_writeback_interval * 10);

	hmdfs_init_dirty_limit(&hdtc);

	while (1) {
		unsigned long exceed = 0;
		unsigned long diff;

		/* Per-filesystem overbalance writeback */
		hdtc.fs_nr_dirty = wb_stat_sum(wb, WB_RECLAIMABLE);
		hdtc.fs_nr_reclaimable =
			hdtc.fs_nr_dirty + wb_stat_sum(wb, WB_WRITEBACK);
		if (hdtc.fs_nr_reclaimable < hdtc.file_bg_thresh) {
			diff = hmdfs_dirty_intv(hdtc.fs_nr_reclaimable,
						hdtc.file_thresh);
			goto free_running;
		}

		/* Per-file overbalance writeback */
		hdtc.file_nr_dirty =
			hmdfs_idirty_pages(inode, PAGECACHE_TAG_DIRTY);
		hdtc.file_nr_reclaimable =
			hmdfs_idirty_pages(inode, PAGECACHE_TAG_WRITEBACK) +
			hdtc.file_nr_dirty;
		if ((hdtc.fs_nr_reclaimable <
		     hmdfs_dirty_freerun_ceiling(&hdtc, HMDFS_DIRTY_FS)) &&
		    (hdtc.file_nr_reclaimable <
		     hmdfs_dirty_freerun_ceiling(&hdtc, HMDFS_DIRTY_FILE))) {
			unsigned long fs_intv, file_intv;

			fs_intv = hmdfs_dirty_intv(hdtc.fs_nr_reclaimable,
						   hdtc.fs_thresh);
			file_intv = hmdfs_dirty_intv(hdtc.file_nr_reclaimable,
						     hdtc.file_thresh);
			diff = min(fs_intv, file_intv);
free_running:
			current->nr_dirtied_pause = diff;
			current->nr_dirtied = 0;
			break;
		}

		if (hdtc.fs_nr_reclaimable >=
		    hmdfs_dirty_freerun_ceiling(&hdtc, HMDFS_DIRTY_FS)) {
			if (unlikely(!writeback_in_progress(wb)))
				hmdfs_writeback_inodes_sb(sb);
		} else {
			hmdfs_writeback_inode(sb, inode);
		}

		/*
		 * If dirty_auto_threshold is enabled, recalculate the
		 * writeback thresh according to the current bandwidth.
		 * Updating the bandwidth here as well would be better, but
		 * wb_update_bandwidth() is not exported, so the bandwidth
		 * estimate may lag when writing a lot to a single file.
		 */
		if (hwb->dirty_auto_threshold &&
		    time_is_before_jiffies(hdtc.thresh_time_stamp +
					   HMDFS_BANDWIDTH_INTERVAL))
			hmdfs_update_dirty_limit(&hdtc);

		if (unlikely(hdtc.fs_nr_reclaimable >= hdtc.fs_thresh))
			exceed |= HMDFS_FS_EXCEED;
		if (unlikely(hdtc.file_nr_reclaimable >= hdtc.file_thresh))
			exceed |= HMDFS_FILE_EXCEED;

		if (!exceed) {
			trace_hmdfs_balance_dirty_pages(sbi, wb, &hdtc,
							0UL, start_time);
			current->nr_dirtied = 0;
			break;
		}
		/*
		 * Per-file or per-fs reclaimable pages exceed the throttle
		 * limit; sleep for the pause time and check again.
		 */
		dirty_exceeded |= exceed;
		if (dirty_exceeded && !hwb->dirty_exceeded)
			hwb->dirty_exceeded = true;

		/* Pause */
		pause = hmdfs_wb_pause(wb, hdtc.fs_nr_reclaimable);

		trace_hmdfs_balance_dirty_pages(sbi, wb, &hdtc, pause,
						start_time);

		__set_current_state(TASK_KILLABLE);
		io_schedule_timeout(pause);

		if (fatal_signal_pending(current))
			break;
	}

	if (!dirty_exceeded && hwb->dirty_exceeded)
		hwb->dirty_exceeded = false;

	if (hdtc.fs_nr_reclaimable >= hdtc.fs_bg_thresh) {
		if (unlikely(!writeback_in_progress(wb)))
			hmdfs_writeback_inodes_sb(sb);
	} else if (hdtc.file_nr_reclaimable >= hdtc.file_bg_thresh) {
		hmdfs_writeback_inode(sb, inode);
	}
}

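/*
 * Ratelimited entry point, analogous to balance_dirty_pages_ratelimited(),
 * meant to be called after pages of @mapping have been dirtied. The
 * per-task (current->nr_dirtied) and per-CPU (bdp_ratelimits) counters
 * bound how many pages may be dirtied before the more expensive
 * hmdfs_balance_dirty_pages() is invoked; the limit is tightened to
 * HMDFS_DIRTY_EXCEED_RATELIMIT while the dirty thresholds are exceeded.
 */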
void hmdfs_balance_dirty_pages_ratelimited(struct address_space *mapping)
{
	struct hmdfs_sb_info *sbi = mapping->host->i_sb->s_fs_info;
	struct hmdfs_writeback *hwb = sbi->h_wb;
	int *bdp_ratelimits = NULL;
	int ratelimit;

	if (!hwb->dirty_writeback_control)
		return;

	/* Add delayed work to trigger timeout writeback */
	if (hwb->dirty_writeback_interval != 0)
		hmdfs_writeback_inodes_sb_delayed(
			mapping->host->i_sb,
			hwb->dirty_writeback_interval * 10);

	ratelimit = current->nr_dirtied_pause;
	if (hwb->dirty_exceeded)
		ratelimit = min(ratelimit, HMDFS_DIRTY_EXCEED_RATELIMIT);

	/*
	 * This prevents one CPU from accumulating too many dirtied pages
	 * without calling into hmdfs_balance_dirty_pages(), which can
	 * happen when there are 1000+ tasks that all start dirtying pages
	 * at exactly the same time and hence all honour a too large
	 * initial task->nr_dirtied_pause.
	 */
	preempt_disable();
	bdp_ratelimits = this_cpu_ptr(hwb->bdp_ratelimits);

	trace_hmdfs_balance_dirty_pages_ratelimited(sbi, hwb, *bdp_ratelimits);

	if (unlikely(current->nr_dirtied >= ratelimit)) {
		*bdp_ratelimits = 0;
	} else if (unlikely(*bdp_ratelimits >= hwb->ratelimit_pages)) {
		*bdp_ratelimits = 0;
		ratelimit = 0;
	}
	preempt_enable();

	if (unlikely(current->nr_dirtied >= ratelimit))
		hmdfs_balance_dirty_pages(mapping);
}

void hmdfs_destroy_writeback(struct hmdfs_sb_info *sbi)
{
	if (!sbi->h_wb)
		return;

	flush_delayed_work(&sbi->h_wb->dirty_sb_writeback_work);
	flush_delayed_work(&sbi->h_wb->dirty_inode_writeback_work);
	destroy_workqueue(sbi->h_wb->dirty_sb_writeback_wq);
	destroy_workqueue(sbi->h_wb->dirty_inode_writeback_wq);
	free_percpu(sbi->h_wb->bdp_ratelimits);
	kfree(sbi->h_wb);
	sbi->h_wb = NULL;
}

int hmdfs_init_writeback(struct hmdfs_sb_info *sbi)
{
	struct hmdfs_writeback *hwb;
	char name[HMDFS_WQ_NAME_LEN];
	int ret = -ENOMEM;

	hwb = kzalloc(sizeof(struct hmdfs_writeback), GFP_KERNEL);
	if (!hwb)
		return ret;

	hwb->sbi = sbi;
	hwb->wb = &sbi->sb->s_bdi->wb;
	hwb->dirty_writeback_control = true;
	hwb->dirty_writeback_interval = HM_DEFAULT_WRITEBACK_INTERVAL;
	hwb->dirty_file_bg_bytes = HMDFS_FILE_BG_WB_BYTES;
	hwb->dirty_fs_bg_bytes = HMDFS_FS_BG_WB_BYTES;
	hwb->dirty_file_bytes = HMDFS_FILE_WB_BYTES;
	hwb->dirty_fs_bytes = HMDFS_FS_WB_BYTES;
	hmdfs_calculate_dirty_thresh(hwb);
	hwb->bw_file_thresh = hwb->dirty_file_thresh;
	hwb->bw_fs_thresh = hwb->dirty_fs_thresh;
	spin_lock_init(&hwb->inode_list_lock);
	INIT_LIST_HEAD(&hwb->inode_list_head);
	hwb->dirty_exceeded = false;
	hwb->ratelimit_pages = HMDFS_DEF_RATELIMIT_PAGES;
	hwb->dirty_auto_threshold = true;
	hwb->writeback_timelimit = HMDFS_DEF_WB_TIMELIMIT;
	hwb->bw_thresh_lowerlimit = HMDFS_BW_THRESH_DEF_LIMIT;
	spin_lock_init(&hwb->write_bandwidth_lock);
	hwb->avg_write_bandwidth = 0;
	hwb->max_write_bandwidth = 0;
	hwb->min_write_bandwidth = ULONG_MAX;
	hwb->bdp_ratelimits = alloc_percpu(int);
	if (!hwb->bdp_ratelimits)
		goto free_hwb;

	snprintf(name, sizeof(name), "dfs_ino_wb%u", sbi->seq);
	hwb->dirty_inode_writeback_wq = create_singlethread_workqueue(name);
	if (!hwb->dirty_inode_writeback_wq) {
		hmdfs_err("Failed to create inode writeback workqueue!");
		goto free_bdp;
	}
	snprintf(name, sizeof(name), "dfs_sb_wb%u", sbi->seq);
	hwb->dirty_sb_writeback_wq = create_singlethread_workqueue(name);
	if (!hwb->dirty_sb_writeback_wq) {
		hmdfs_err("Failed to create filesystem writeback workqueue!");
		goto free_i_wq;
	}
	INIT_DELAYED_WORK(&hwb->dirty_sb_writeback_work,
			  hmdfs_writeback_inodes_sb_handler);
	INIT_DELAYED_WORK(&hwb->dirty_inode_writeback_work,
			  hmdfs_writeback_inode_handler);
	sbi->h_wb = hwb;
	return 0;

free_i_wq:
	destroy_workqueue(hwb->dirty_inode_writeback_wq);
free_bdp:
	free_percpu(hwb->bdp_ratelimits);
free_hwb:
	kfree(hwb);
	return ret;
}