18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only 28c2ecf20Sopenharmony_ci/* 38c2ecf20Sopenharmony_ci * fs/fs-writeback.c 48c2ecf20Sopenharmony_ci * 58c2ecf20Sopenharmony_ci * Copyright (C) 2002, Linus Torvalds. 68c2ecf20Sopenharmony_ci * 78c2ecf20Sopenharmony_ci * Contains all the functions related to writing back and waiting 88c2ecf20Sopenharmony_ci * upon dirty inodes against superblocks, and writing back dirty 98c2ecf20Sopenharmony_ci * pages against inodes. ie: data writeback. Writeout of the 108c2ecf20Sopenharmony_ci * inode itself is not handled here. 118c2ecf20Sopenharmony_ci * 128c2ecf20Sopenharmony_ci * 10Apr2002 Andrew Morton 138c2ecf20Sopenharmony_ci * Split out of fs/inode.c 148c2ecf20Sopenharmony_ci * Additions for address_space-based writeback 158c2ecf20Sopenharmony_ci */ 168c2ecf20Sopenharmony_ci 178c2ecf20Sopenharmony_ci#include <linux/kernel.h> 188c2ecf20Sopenharmony_ci#include <linux/export.h> 198c2ecf20Sopenharmony_ci#include <linux/spinlock.h> 208c2ecf20Sopenharmony_ci#include <linux/slab.h> 218c2ecf20Sopenharmony_ci#include <linux/sched.h> 228c2ecf20Sopenharmony_ci#include <linux/fs.h> 238c2ecf20Sopenharmony_ci#include <linux/mm.h> 248c2ecf20Sopenharmony_ci#include <linux/pagemap.h> 258c2ecf20Sopenharmony_ci#include <linux/kthread.h> 268c2ecf20Sopenharmony_ci#include <linux/writeback.h> 278c2ecf20Sopenharmony_ci#include <linux/blkdev.h> 288c2ecf20Sopenharmony_ci#include <linux/backing-dev.h> 298c2ecf20Sopenharmony_ci#include <linux/tracepoint.h> 308c2ecf20Sopenharmony_ci#include <linux/device.h> 318c2ecf20Sopenharmony_ci#include <linux/memcontrol.h> 328c2ecf20Sopenharmony_ci#include "internal.h" 338c2ecf20Sopenharmony_ci 348c2ecf20Sopenharmony_ci/* 358c2ecf20Sopenharmony_ci * 4MB minimal write chunk size 368c2ecf20Sopenharmony_ci */ 378c2ecf20Sopenharmony_ci#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_SHIFT - 10)) 388c2ecf20Sopenharmony_ci 398c2ecf20Sopenharmony_ci/* 408c2ecf20Sopenharmony_ci * Passed into wb_writeback(), 
essentially a subset of writeback_control
 *
 * A work item is handed to a bdi_writeback by wb_queue_work() and is
 * released through finish_writeback_work().
 */
struct wb_writeback_work {
	/*
	 * Writeback request parameters.  Their consumer is not part of this
	 * section of the file; NOTE(review): presumably they mirror the
	 * same-named writeback_control fields - confirm against
	 * wb_writeback().
	 */
	long nr_pages;
	struct super_block *sb;
	enum writeback_sync_modes sync_mode;
	unsigned int tagged_writepages:1;
	unsigned int for_kupdate:1;
	unsigned int range_cyclic:1;
	unsigned int for_background:1;
	unsigned int for_sync:1;	/* sync(2) WB_SYNC_ALL writeback */
	/* when set, finish_writeback_work() kfree()s this item */
	unsigned int auto_free:1;	/* free on completion */
	enum wb_reason reason;		/* why was writeback initiated? */

	/* linked onto wb->work_list by wb_queue_work() */
	struct list_head list;		/* pending work list */
	/*
	 * Optional completion: ->cnt is incremented when the item is queued
	 * and decremented (waking ->waitq on zero) when it is finished.
	 */
	struct wb_completion *done;	/* set if the caller waits */
};

/*
 * If an inode is constantly having its pages dirtied, but then the
 * updates stop dirtytime_expire_interval seconds in the past, it's
 * possible for the worst case time between when an inode has its
 * timestamps updated and when they finally get written out to be two
 * dirtytime_expire_intervals.  We set the default to 12 hours (in
 * seconds), which means most of the time inodes will have their
 * timestamps written to disk after 12 hours, but in the worst case a
 * few inodes might not have their timestamps updated for 24 hours.
678c2ecf20Sopenharmony_ci */ 688c2ecf20Sopenharmony_ciunsigned int dirtytime_expire_interval = 12 * 60 * 60; 698c2ecf20Sopenharmony_ci 708c2ecf20Sopenharmony_cistatic inline struct inode *wb_inode(struct list_head *head) 718c2ecf20Sopenharmony_ci{ 728c2ecf20Sopenharmony_ci return list_entry(head, struct inode, i_io_list); 738c2ecf20Sopenharmony_ci} 748c2ecf20Sopenharmony_ci 758c2ecf20Sopenharmony_ci/* 768c2ecf20Sopenharmony_ci * Include the creation of the trace points after defining the 778c2ecf20Sopenharmony_ci * wb_writeback_work structure and inline functions so that the definition 788c2ecf20Sopenharmony_ci * remains local to this file. 798c2ecf20Sopenharmony_ci */ 808c2ecf20Sopenharmony_ci#define CREATE_TRACE_POINTS 818c2ecf20Sopenharmony_ci#include <trace/events/writeback.h> 828c2ecf20Sopenharmony_ci 838c2ecf20Sopenharmony_ciEXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage); 848c2ecf20Sopenharmony_ci 858c2ecf20Sopenharmony_cistatic bool wb_io_lists_populated(struct bdi_writeback *wb) 868c2ecf20Sopenharmony_ci{ 878c2ecf20Sopenharmony_ci if (wb_has_dirty_io(wb)) { 888c2ecf20Sopenharmony_ci return false; 898c2ecf20Sopenharmony_ci } else { 908c2ecf20Sopenharmony_ci set_bit(WB_has_dirty_io, &wb->state); 918c2ecf20Sopenharmony_ci WARN_ON_ONCE(!wb->avg_write_bandwidth); 928c2ecf20Sopenharmony_ci atomic_long_add(wb->avg_write_bandwidth, 938c2ecf20Sopenharmony_ci &wb->bdi->tot_write_bandwidth); 948c2ecf20Sopenharmony_ci return true; 958c2ecf20Sopenharmony_ci } 968c2ecf20Sopenharmony_ci} 978c2ecf20Sopenharmony_ci 988c2ecf20Sopenharmony_cistatic void wb_io_lists_depopulated(struct bdi_writeback *wb) 998c2ecf20Sopenharmony_ci{ 1008c2ecf20Sopenharmony_ci if (wb_has_dirty_io(wb) && list_empty(&wb->b_dirty) && 1018c2ecf20Sopenharmony_ci list_empty(&wb->b_io) && list_empty(&wb->b_more_io)) { 1028c2ecf20Sopenharmony_ci clear_bit(WB_has_dirty_io, &wb->state); 1038c2ecf20Sopenharmony_ci WARN_ON_ONCE(atomic_long_sub_return(wb->avg_write_bandwidth, 1048c2ecf20Sopenharmony_ci 
&wb->bdi->tot_write_bandwidth) < 0); 1058c2ecf20Sopenharmony_ci } 1068c2ecf20Sopenharmony_ci} 1078c2ecf20Sopenharmony_ci 1088c2ecf20Sopenharmony_ci/** 1098c2ecf20Sopenharmony_ci * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list 1108c2ecf20Sopenharmony_ci * @inode: inode to be moved 1118c2ecf20Sopenharmony_ci * @wb: target bdi_writeback 1128c2ecf20Sopenharmony_ci * @head: one of @wb->b_{dirty|io|more_io|dirty_time} 1138c2ecf20Sopenharmony_ci * 1148c2ecf20Sopenharmony_ci * Move @inode->i_io_list to @list of @wb and set %WB_has_dirty_io. 1158c2ecf20Sopenharmony_ci * Returns %true if @inode is the first occupant of the !dirty_time IO 1168c2ecf20Sopenharmony_ci * lists; otherwise, %false. 1178c2ecf20Sopenharmony_ci */ 1188c2ecf20Sopenharmony_cistatic bool inode_io_list_move_locked(struct inode *inode, 1198c2ecf20Sopenharmony_ci struct bdi_writeback *wb, 1208c2ecf20Sopenharmony_ci struct list_head *head) 1218c2ecf20Sopenharmony_ci{ 1228c2ecf20Sopenharmony_ci assert_spin_locked(&wb->list_lock); 1238c2ecf20Sopenharmony_ci 1248c2ecf20Sopenharmony_ci list_move(&inode->i_io_list, head); 1258c2ecf20Sopenharmony_ci 1268c2ecf20Sopenharmony_ci /* dirty_time doesn't count as dirty_io until expiration */ 1278c2ecf20Sopenharmony_ci if (head != &wb->b_dirty_time) 1288c2ecf20Sopenharmony_ci return wb_io_lists_populated(wb); 1298c2ecf20Sopenharmony_ci 1308c2ecf20Sopenharmony_ci wb_io_lists_depopulated(wb); 1318c2ecf20Sopenharmony_ci return false; 1328c2ecf20Sopenharmony_ci} 1338c2ecf20Sopenharmony_ci 1348c2ecf20Sopenharmony_ci/** 1358c2ecf20Sopenharmony_ci * inode_io_list_del_locked - remove an inode from its bdi_writeback IO list 1368c2ecf20Sopenharmony_ci * @inode: inode to be removed 1378c2ecf20Sopenharmony_ci * @wb: bdi_writeback @inode is being removed from 1388c2ecf20Sopenharmony_ci * 1398c2ecf20Sopenharmony_ci * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and 1408c2ecf20Sopenharmony_ci * clear %WB_has_dirty_io if all are empty 
afterwards. 1418c2ecf20Sopenharmony_ci */ 1428c2ecf20Sopenharmony_cistatic void inode_io_list_del_locked(struct inode *inode, 1438c2ecf20Sopenharmony_ci struct bdi_writeback *wb) 1448c2ecf20Sopenharmony_ci{ 1458c2ecf20Sopenharmony_ci assert_spin_locked(&wb->list_lock); 1468c2ecf20Sopenharmony_ci assert_spin_locked(&inode->i_lock); 1478c2ecf20Sopenharmony_ci 1488c2ecf20Sopenharmony_ci inode->i_state &= ~I_SYNC_QUEUED; 1498c2ecf20Sopenharmony_ci list_del_init(&inode->i_io_list); 1508c2ecf20Sopenharmony_ci wb_io_lists_depopulated(wb); 1518c2ecf20Sopenharmony_ci} 1528c2ecf20Sopenharmony_ci 1538c2ecf20Sopenharmony_cistatic void wb_wakeup(struct bdi_writeback *wb) 1548c2ecf20Sopenharmony_ci{ 1558c2ecf20Sopenharmony_ci spin_lock_bh(&wb->work_lock); 1568c2ecf20Sopenharmony_ci if (test_bit(WB_registered, &wb->state)) 1578c2ecf20Sopenharmony_ci mod_delayed_work(bdi_wq, &wb->dwork, 0); 1588c2ecf20Sopenharmony_ci spin_unlock_bh(&wb->work_lock); 1598c2ecf20Sopenharmony_ci} 1608c2ecf20Sopenharmony_ci 1618c2ecf20Sopenharmony_cistatic void finish_writeback_work(struct bdi_writeback *wb, 1628c2ecf20Sopenharmony_ci struct wb_writeback_work *work) 1638c2ecf20Sopenharmony_ci{ 1648c2ecf20Sopenharmony_ci struct wb_completion *done = work->done; 1658c2ecf20Sopenharmony_ci 1668c2ecf20Sopenharmony_ci if (work->auto_free) 1678c2ecf20Sopenharmony_ci kfree(work); 1688c2ecf20Sopenharmony_ci if (done) { 1698c2ecf20Sopenharmony_ci wait_queue_head_t *waitq = done->waitq; 1708c2ecf20Sopenharmony_ci 1718c2ecf20Sopenharmony_ci /* @done can't be accessed after the following dec */ 1728c2ecf20Sopenharmony_ci if (atomic_dec_and_test(&done->cnt)) 1738c2ecf20Sopenharmony_ci wake_up_all(waitq); 1748c2ecf20Sopenharmony_ci } 1758c2ecf20Sopenharmony_ci} 1768c2ecf20Sopenharmony_ci 1778c2ecf20Sopenharmony_cistatic void wb_queue_work(struct bdi_writeback *wb, 1788c2ecf20Sopenharmony_ci struct wb_writeback_work *work) 1798c2ecf20Sopenharmony_ci{ 1808c2ecf20Sopenharmony_ci trace_writeback_queue(wb, work); 
1818c2ecf20Sopenharmony_ci 1828c2ecf20Sopenharmony_ci if (work->done) 1838c2ecf20Sopenharmony_ci atomic_inc(&work->done->cnt); 1848c2ecf20Sopenharmony_ci 1858c2ecf20Sopenharmony_ci spin_lock_bh(&wb->work_lock); 1868c2ecf20Sopenharmony_ci 1878c2ecf20Sopenharmony_ci if (test_bit(WB_registered, &wb->state)) { 1888c2ecf20Sopenharmony_ci list_add_tail(&work->list, &wb->work_list); 1898c2ecf20Sopenharmony_ci mod_delayed_work(bdi_wq, &wb->dwork, 0); 1908c2ecf20Sopenharmony_ci } else 1918c2ecf20Sopenharmony_ci finish_writeback_work(wb, work); 1928c2ecf20Sopenharmony_ci 1938c2ecf20Sopenharmony_ci spin_unlock_bh(&wb->work_lock); 1948c2ecf20Sopenharmony_ci} 1958c2ecf20Sopenharmony_ci 1968c2ecf20Sopenharmony_ci/** 1978c2ecf20Sopenharmony_ci * wb_wait_for_completion - wait for completion of bdi_writeback_works 1988c2ecf20Sopenharmony_ci * @done: target wb_completion 1998c2ecf20Sopenharmony_ci * 2008c2ecf20Sopenharmony_ci * Wait for one or more work items issued to @bdi with their ->done field 2018c2ecf20Sopenharmony_ci * set to @done, which should have been initialized with 2028c2ecf20Sopenharmony_ci * DEFINE_WB_COMPLETION(). This function returns after all such work items 2038c2ecf20Sopenharmony_ci * are completed. Work items which are waited upon aren't freed 2048c2ecf20Sopenharmony_ci * automatically on completion. 2058c2ecf20Sopenharmony_ci */ 2068c2ecf20Sopenharmony_civoid wb_wait_for_completion(struct wb_completion *done) 2078c2ecf20Sopenharmony_ci{ 2088c2ecf20Sopenharmony_ci atomic_dec(&done->cnt); /* put down the initial count */ 2098c2ecf20Sopenharmony_ci wait_event(*done->waitq, !atomic_read(&done->cnt)); 2108c2ecf20Sopenharmony_ci} 2118c2ecf20Sopenharmony_ci 2128c2ecf20Sopenharmony_ci#ifdef CONFIG_CGROUP_WRITEBACK 2138c2ecf20Sopenharmony_ci 2148c2ecf20Sopenharmony_ci/* 2158c2ecf20Sopenharmony_ci * Parameters for foreign inode detection, see wbc_detach_inode() to see 2168c2ecf20Sopenharmony_ci * how they're used. 
 *
 * These parameters are inherently heuristic as the detection target
 * itself is fuzzy.  All we want to do is detach an inode from the
 * current owner if it's being written to by some other cgroups too much.
 *
 * The current cgroup writeback is built on the assumption that multiple
 * cgroups writing to the same inode concurrently is very rare and a mode
 * of operation which isn't well supported.  As such, the goal is not
 * taking too long when a different cgroup takes over an inode while
 * avoiding too aggressive flip-flops from occasional foreign writes.
 *
 * We record, very roughly, 2s worth of IO time history and if more than
 * half of that is foreign, trigger the switch.  The recording is quantized
 * to 16 slots.  To avoid tiny writes from swinging the decision too much,
 * writes smaller than 1/8 of avg size are ignored.
2328c2ecf20Sopenharmony_ci */ 2338c2ecf20Sopenharmony_ci#define WB_FRN_TIME_SHIFT 13 /* 1s = 2^13, upto 8 secs w/ 16bit */ 2348c2ecf20Sopenharmony_ci#define WB_FRN_TIME_AVG_SHIFT 3 /* avg = avg * 7/8 + new * 1/8 */ 2358c2ecf20Sopenharmony_ci#define WB_FRN_TIME_CUT_DIV 8 /* ignore rounds < avg / 8 */ 2368c2ecf20Sopenharmony_ci#define WB_FRN_TIME_PERIOD (2 * (1 << WB_FRN_TIME_SHIFT)) /* 2s */ 2378c2ecf20Sopenharmony_ci 2388c2ecf20Sopenharmony_ci#define WB_FRN_HIST_SLOTS 16 /* inode->i_wb_frn_history is 16bit */ 2398c2ecf20Sopenharmony_ci#define WB_FRN_HIST_UNIT (WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS) 2408c2ecf20Sopenharmony_ci /* each slot's duration is 2s / 16 */ 2418c2ecf20Sopenharmony_ci#define WB_FRN_HIST_THR_SLOTS (WB_FRN_HIST_SLOTS / 2) 2428c2ecf20Sopenharmony_ci /* if foreign slots >= 8, switch */ 2438c2ecf20Sopenharmony_ci#define WB_FRN_HIST_MAX_SLOTS (WB_FRN_HIST_THR_SLOTS / 2 + 1) 2448c2ecf20Sopenharmony_ci /* one round can affect upto 5 slots */ 2458c2ecf20Sopenharmony_ci#define WB_FRN_MAX_IN_FLIGHT 1024 /* don't queue too many concurrently */ 2468c2ecf20Sopenharmony_ci 2478c2ecf20Sopenharmony_cistatic atomic_t isw_nr_in_flight = ATOMIC_INIT(0); 2488c2ecf20Sopenharmony_cistatic struct workqueue_struct *isw_wq; 2498c2ecf20Sopenharmony_ci 2508c2ecf20Sopenharmony_civoid __inode_attach_wb(struct inode *inode, struct page *page) 2518c2ecf20Sopenharmony_ci{ 2528c2ecf20Sopenharmony_ci struct backing_dev_info *bdi = inode_to_bdi(inode); 2538c2ecf20Sopenharmony_ci struct bdi_writeback *wb = NULL; 2548c2ecf20Sopenharmony_ci 2558c2ecf20Sopenharmony_ci if (inode_cgwb_enabled(inode)) { 2568c2ecf20Sopenharmony_ci struct cgroup_subsys_state *memcg_css; 2578c2ecf20Sopenharmony_ci 2588c2ecf20Sopenharmony_ci if (page) { 2598c2ecf20Sopenharmony_ci memcg_css = mem_cgroup_css_from_page(page); 2608c2ecf20Sopenharmony_ci wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); 2618c2ecf20Sopenharmony_ci } else { 2628c2ecf20Sopenharmony_ci /* must pin memcg_css, see wb_get_create() */ 
2638c2ecf20Sopenharmony_ci memcg_css = task_get_css(current, memory_cgrp_id); 2648c2ecf20Sopenharmony_ci wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); 2658c2ecf20Sopenharmony_ci css_put(memcg_css); 2668c2ecf20Sopenharmony_ci } 2678c2ecf20Sopenharmony_ci } 2688c2ecf20Sopenharmony_ci 2698c2ecf20Sopenharmony_ci if (!wb) 2708c2ecf20Sopenharmony_ci wb = &bdi->wb; 2718c2ecf20Sopenharmony_ci 2728c2ecf20Sopenharmony_ci /* 2738c2ecf20Sopenharmony_ci * There may be multiple instances of this function racing to 2748c2ecf20Sopenharmony_ci * update the same inode. Use cmpxchg() to tell the winner. 2758c2ecf20Sopenharmony_ci */ 2768c2ecf20Sopenharmony_ci if (unlikely(cmpxchg(&inode->i_wb, NULL, wb))) 2778c2ecf20Sopenharmony_ci wb_put(wb); 2788c2ecf20Sopenharmony_ci} 2798c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(__inode_attach_wb); 2808c2ecf20Sopenharmony_ci 2818c2ecf20Sopenharmony_ci/** 2828c2ecf20Sopenharmony_ci * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it 2838c2ecf20Sopenharmony_ci * @inode: inode of interest with i_lock held 2848c2ecf20Sopenharmony_ci * 2858c2ecf20Sopenharmony_ci * Returns @inode's wb with its list_lock held. @inode->i_lock must be 2868c2ecf20Sopenharmony_ci * held on entry and is released on return. The returned wb is guaranteed 2878c2ecf20Sopenharmony_ci * to stay @inode's associated wb until its list_lock is released. 
2888c2ecf20Sopenharmony_ci */ 2898c2ecf20Sopenharmony_cistatic struct bdi_writeback * 2908c2ecf20Sopenharmony_cilocked_inode_to_wb_and_lock_list(struct inode *inode) 2918c2ecf20Sopenharmony_ci __releases(&inode->i_lock) 2928c2ecf20Sopenharmony_ci __acquires(&wb->list_lock) 2938c2ecf20Sopenharmony_ci{ 2948c2ecf20Sopenharmony_ci while (true) { 2958c2ecf20Sopenharmony_ci struct bdi_writeback *wb = inode_to_wb(inode); 2968c2ecf20Sopenharmony_ci 2978c2ecf20Sopenharmony_ci /* 2988c2ecf20Sopenharmony_ci * inode_to_wb() association is protected by both 2998c2ecf20Sopenharmony_ci * @inode->i_lock and @wb->list_lock but list_lock nests 3008c2ecf20Sopenharmony_ci * outside i_lock. Drop i_lock and verify that the 3018c2ecf20Sopenharmony_ci * association hasn't changed after acquiring list_lock. 3028c2ecf20Sopenharmony_ci */ 3038c2ecf20Sopenharmony_ci wb_get(wb); 3048c2ecf20Sopenharmony_ci spin_unlock(&inode->i_lock); 3058c2ecf20Sopenharmony_ci spin_lock(&wb->list_lock); 3068c2ecf20Sopenharmony_ci 3078c2ecf20Sopenharmony_ci /* i_wb may have changed inbetween, can't use inode_to_wb() */ 3088c2ecf20Sopenharmony_ci if (likely(wb == inode->i_wb)) { 3098c2ecf20Sopenharmony_ci wb_put(wb); /* @inode already has ref */ 3108c2ecf20Sopenharmony_ci return wb; 3118c2ecf20Sopenharmony_ci } 3128c2ecf20Sopenharmony_ci 3138c2ecf20Sopenharmony_ci spin_unlock(&wb->list_lock); 3148c2ecf20Sopenharmony_ci wb_put(wb); 3158c2ecf20Sopenharmony_ci cpu_relax(); 3168c2ecf20Sopenharmony_ci spin_lock(&inode->i_lock); 3178c2ecf20Sopenharmony_ci } 3188c2ecf20Sopenharmony_ci} 3198c2ecf20Sopenharmony_ci 3208c2ecf20Sopenharmony_ci/** 3218c2ecf20Sopenharmony_ci * inode_to_wb_and_lock_list - determine an inode's wb and lock it 3228c2ecf20Sopenharmony_ci * @inode: inode of interest 3238c2ecf20Sopenharmony_ci * 3248c2ecf20Sopenharmony_ci * Same as locked_inode_to_wb_and_lock_list() but @inode->i_lock isn't held 3258c2ecf20Sopenharmony_ci * on entry. 
*/
static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
	__acquires(&wb->list_lock)
{
	/* the callee expects i_lock held and releases it before returning */
	spin_lock(&inode->i_lock);
	return locked_inode_to_wb_and_lock_list(inode);
}

/*
 * State carried through one asynchronous wb switch: allocated in
 * inode_switch_wbs(), passed through call_rcu() to a work item and freed
 * at the end of inode_switch_wbs_work_fn().
 */
struct inode_switch_wbs_context {
	struct inode		*inode;		/* pinned with __iget() */
	struct bdi_writeback	*new_wb;	/* switch target, ref held */

	struct rcu_head		rcu_head;	/* for call_rcu() */
	struct work_struct	work;		/* runs inode_switch_wbs_work_fn() */
};

/*
 * Take @bdi->wb_switch_rwsem for writing.  inode_switch_wbs_work_fn()
 * takes it for reading around the actual switch.
 */
static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi)
{
	down_write(&bdi->wb_switch_rwsem);
}

/* Release @bdi->wb_switch_rwsem taken by bdi_down_write_wb_switch_rwsem(). */
static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi)
{
	up_write(&bdi->wb_switch_rwsem);
}

/*
 * Work item which performs the actual switch of isw->inode from its
 * current wb to isw->new_wb: transfers the dirty/writeback page stats,
 * moves the inode onto the new wb's ->b_dirty list if it was on an IO
 * list, resets the foreign detection history and clears I_WB_SWITCH.
 * Afterwards the inode reference, the context's reference on the new wb
 * and the context itself are released and isw_nr_in_flight is decremented.
 * The switch is skipped (but everything is still released) if the inode
 * is found to be I_FREEING.
 */
static void inode_switch_wbs_work_fn(struct work_struct *work)
{
	struct inode_switch_wbs_context *isw =
		container_of(work, struct inode_switch_wbs_context, work);
	struct inode *inode = isw->inode;
	struct backing_dev_info *bdi = inode_to_bdi(inode);
	struct address_space *mapping = inode->i_mapping;
	struct bdi_writeback *old_wb = inode->i_wb;
	struct bdi_writeback *new_wb = isw->new_wb;
	XA_STATE(xas, &mapping->i_pages, 0);
	struct page *page;
	bool switched = false;

	/*
	 * If @inode switches cgwb membership while sync_inodes_sb() is
	 * being issued, sync_inodes_sb() might miss it.  Synchronize.
	 */
	down_read(&bdi->wb_switch_rwsem);

	/*
	 * By the time control reaches here, RCU grace period has passed
	 * since I_WB_SWITCH assertion and all wb stat update transactions
	 * between unlocked_inode_to_wb_begin/end() are guaranteed to be
	 * synchronizing against the i_pages lock.
	 *
	 * Grabbing old_wb->list_lock, inode->i_lock and the i_pages lock
	 * gives us exclusion against all wb related operations on @inode
	 * including IO list manipulations and stat updates.
	 *
	 * The two list_locks are taken in ascending address order so that
	 * they are always nested the same way.
	 */
	if (old_wb < new_wb) {
		spin_lock(&old_wb->list_lock);
		spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
	} else {
		spin_lock(&new_wb->list_lock);
		spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
	}
	spin_lock(&inode->i_lock);
	xa_lock_irq(&mapping->i_pages);

	/*
	 * Once I_FREEING is visible under i_lock, the eviction path owns
	 * the inode and we shouldn't modify ->i_io_list.
	 */
	if (unlikely(inode->i_state & I_FREEING))
		goto skip_switch;

	trace_inode_switch_wbs(inode, old_wb, new_wb);

	/*
	 * Count and transfer stats.  Note that PAGECACHE_TAG_DIRTY points
	 * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to
	 * pages actually under writeback.
	 */
	xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_DIRTY) {
		if (PageDirty(page)) {
			dec_wb_stat(old_wb, WB_RECLAIMABLE);
			inc_wb_stat(new_wb, WB_RECLAIMABLE);
		}
	}

	/* restart the walk from index 0 for the second tag */
	xas_set(&xas, 0);
	xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_WRITEBACK) {
		WARN_ON_ONCE(!PageWriteback(page));
		dec_wb_stat(old_wb, WB_WRITEBACK);
		inc_wb_stat(new_wb, WB_WRITEBACK);
	}

	/*
	 * Extra reference for the inode->i_wb pointer installed below; the
	 * reference held by @isw is dropped separately at the end.
	 */
	wb_get(new_wb);

	/*
	 * Transfer to @new_wb's IO list if necessary.  The specific list
	 * @inode was on is ignored and the inode is put on ->b_dirty which
	 * is always correct including from ->b_dirty_time.  The transfer
	 * preserves @inode->dirtied_when ordering.
	 */
	if (!list_empty(&inode->i_io_list)) {
		struct inode *pos;

		inode_io_list_del_locked(inode, old_wb);
		inode->i_wb = new_wb;
		/*
		 * Stop at the first entry that @inode is not older than and
		 * insert right before it.  If there is none, @pos ends up
		 * as the list head's container and @inode goes to the tail.
		 */
		list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
			if (time_after_eq(inode->dirtied_when,
					  pos->dirtied_when))
				break;
		inode_io_list_move_locked(inode, new_wb, pos->i_io_list.prev);
	} else {
		inode->i_wb = new_wb;
	}

	/* ->i_wb_frn updates may race wbc_detach_inode() but doesn't matter */
	inode->i_wb_frn_winner = 0;
	inode->i_wb_frn_avg_time = 0;
	inode->i_wb_frn_history = 0;
	switched = true;
skip_switch:
	/*
	 * Paired with load_acquire in unlocked_inode_to_wb_begin() and
	 * ensures that the new wb is visible if they see !I_WB_SWITCH.
	 */
	smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH);

	xa_unlock_irq(&mapping->i_pages);
	spin_unlock(&inode->i_lock);
	spin_unlock(&new_wb->list_lock);
	spin_unlock(&old_wb->list_lock);

	up_read(&bdi->wb_switch_rwsem);

	if (switched) {
		wb_wakeup(new_wb);
		/* @inode no longer points at @old_wb, drop its reference */
		wb_put(old_wb);
	}
	/* drop the reference obtained via wb_get_create() in inode_switch_wbs() */
	wb_put(new_wb);

	/* pairs with __iget() in inode_switch_wbs() */
	iput(inode);
	kfree(isw);

	atomic_dec(&isw_nr_in_flight);
}

/*
 * RCU callback queued by inode_switch_wbs(); it only hands the context
 * over to inode_switch_wbs_work_fn() on isw_wq.
 */
static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
{
	struct inode_switch_wbs_context *isw = container_of(rcu_head,
				struct inode_switch_wbs_context, rcu_head);

	/* needs to grab bh-unsafe locks, bounce to work item */
	INIT_WORK(&isw->work, inode_switch_wbs_work_fn);
	queue_work(isw_wq, &isw->work);
}

/**
 * inode_switch_wbs - change the wb association of an inode
 * @inode: target inode
 * @new_wb_id: ID of the new wb
 *
 * Switch @inode's wb association to the wb identified by @new_wb_id.  The
 * switching is performed asynchronously and may fail silently.
4898c2ecf20Sopenharmony_ci */ 4908c2ecf20Sopenharmony_cistatic void inode_switch_wbs(struct inode *inode, int new_wb_id) 4918c2ecf20Sopenharmony_ci{ 4928c2ecf20Sopenharmony_ci struct backing_dev_info *bdi = inode_to_bdi(inode); 4938c2ecf20Sopenharmony_ci struct cgroup_subsys_state *memcg_css; 4948c2ecf20Sopenharmony_ci struct inode_switch_wbs_context *isw; 4958c2ecf20Sopenharmony_ci 4968c2ecf20Sopenharmony_ci /* noop if seems to be already in progress */ 4978c2ecf20Sopenharmony_ci if (inode->i_state & I_WB_SWITCH) 4988c2ecf20Sopenharmony_ci return; 4998c2ecf20Sopenharmony_ci 5008c2ecf20Sopenharmony_ci /* avoid queueing a new switch if too many are already in flight */ 5018c2ecf20Sopenharmony_ci if (atomic_read(&isw_nr_in_flight) > WB_FRN_MAX_IN_FLIGHT) 5028c2ecf20Sopenharmony_ci return; 5038c2ecf20Sopenharmony_ci 5048c2ecf20Sopenharmony_ci isw = kzalloc(sizeof(*isw), GFP_ATOMIC); 5058c2ecf20Sopenharmony_ci if (!isw) 5068c2ecf20Sopenharmony_ci return; 5078c2ecf20Sopenharmony_ci 5088c2ecf20Sopenharmony_ci atomic_inc(&isw_nr_in_flight); 5098c2ecf20Sopenharmony_ci 5108c2ecf20Sopenharmony_ci /* find and pin the new wb */ 5118c2ecf20Sopenharmony_ci rcu_read_lock(); 5128c2ecf20Sopenharmony_ci memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys); 5138c2ecf20Sopenharmony_ci if (memcg_css && !css_tryget(memcg_css)) 5148c2ecf20Sopenharmony_ci memcg_css = NULL; 5158c2ecf20Sopenharmony_ci rcu_read_unlock(); 5168c2ecf20Sopenharmony_ci if (!memcg_css) 5178c2ecf20Sopenharmony_ci goto out_free; 5188c2ecf20Sopenharmony_ci 5198c2ecf20Sopenharmony_ci isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); 5208c2ecf20Sopenharmony_ci css_put(memcg_css); 5218c2ecf20Sopenharmony_ci if (!isw->new_wb) 5228c2ecf20Sopenharmony_ci goto out_free; 5238c2ecf20Sopenharmony_ci 5248c2ecf20Sopenharmony_ci /* while holding I_WB_SWITCH, no one else can update the association */ 5258c2ecf20Sopenharmony_ci spin_lock(&inode->i_lock); 5268c2ecf20Sopenharmony_ci if (!(inode->i_sb->s_flags & SB_ACTIVE) 
|| 5278c2ecf20Sopenharmony_ci inode->i_state & (I_WB_SWITCH | I_FREEING) || 5288c2ecf20Sopenharmony_ci inode_to_wb(inode) == isw->new_wb) { 5298c2ecf20Sopenharmony_ci spin_unlock(&inode->i_lock); 5308c2ecf20Sopenharmony_ci goto out_free; 5318c2ecf20Sopenharmony_ci } 5328c2ecf20Sopenharmony_ci inode->i_state |= I_WB_SWITCH; 5338c2ecf20Sopenharmony_ci __iget(inode); 5348c2ecf20Sopenharmony_ci spin_unlock(&inode->i_lock); 5358c2ecf20Sopenharmony_ci 5368c2ecf20Sopenharmony_ci isw->inode = inode; 5378c2ecf20Sopenharmony_ci 5388c2ecf20Sopenharmony_ci /* 5398c2ecf20Sopenharmony_ci * In addition to synchronizing among switchers, I_WB_SWITCH tells 5408c2ecf20Sopenharmony_ci * the RCU protected stat update paths to grab the i_page 5418c2ecf20Sopenharmony_ci * lock so that stat transfer can synchronize against them. 5428c2ecf20Sopenharmony_ci * Let's continue after I_WB_SWITCH is guaranteed to be visible. 5438c2ecf20Sopenharmony_ci */ 5448c2ecf20Sopenharmony_ci call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn); 5458c2ecf20Sopenharmony_ci return; 5468c2ecf20Sopenharmony_ci 5478c2ecf20Sopenharmony_ciout_free: 5488c2ecf20Sopenharmony_ci atomic_dec(&isw_nr_in_flight); 5498c2ecf20Sopenharmony_ci if (isw->new_wb) 5508c2ecf20Sopenharmony_ci wb_put(isw->new_wb); 5518c2ecf20Sopenharmony_ci kfree(isw); 5528c2ecf20Sopenharmony_ci} 5538c2ecf20Sopenharmony_ci 5548c2ecf20Sopenharmony_ci/** 5558c2ecf20Sopenharmony_ci * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it 5568c2ecf20Sopenharmony_ci * @wbc: writeback_control of interest 5578c2ecf20Sopenharmony_ci * @inode: target inode 5588c2ecf20Sopenharmony_ci * 5598c2ecf20Sopenharmony_ci * @inode is locked and about to be written back under the control of @wbc. 5608c2ecf20Sopenharmony_ci * Record @inode's writeback context into @wbc and unlock the i_lock. On 5618c2ecf20Sopenharmony_ci * writeback completion, wbc_detach_inode() should be called. 
 * This is used to track the cgroup writeback context.
 *
 * If cgroup writeback is not enabled for @inode, only the i_lock is
 * released and @wbc is left untouched.
 */
void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
				 struct inode *inode)
{
	/* no cgroup writeback for this inode: nothing to record, just unlock */
	if (!inode_cgwb_enabled(inode)) {
		spin_unlock(&inode->i_lock);
		return;
	}

	wbc->wb = inode_to_wb(inode);
	wbc->inode = inode;

	/*
	 * Start a fresh foreign-detection round: remember the current owner
	 * and the previous round's winner, and zero the per-round candidate
	 * and byte counters that wbc_account_cgroup_owner() accumulates into.
	 */
	wbc->wb_id = wbc->wb->memcg_css->id;
	wbc->wb_lcand_id = inode->i_wb_frn_winner;
	wbc->wb_tcand_id = 0;
	wbc->wb_bytes = 0;
	wbc->wb_lcand_bytes = 0;
	wbc->wb_tcand_bytes = 0;

	/*
	 * Take the wb reference while i_lock is still held; the matching
	 * wb_put() is in wbc_detach_inode().
	 */
	wb_get(wbc->wb);
	spin_unlock(&inode->i_lock);

	/*
	 * A dying wb indicates that either the blkcg associated with the
	 * memcg changed or the associated memcg is dying.  In the first
	 * case, a replacement wb should already be available and we should
	 * refresh the wb immediately.  In the second case, trying to
	 * refresh will keep failing.
	 */
	if (unlikely(wb_dying(wbc->wb) && !css_is_dying(wbc->wb->memcg_css)))
		inode_switch_wbs(inode, wbc->wb_id);
}
EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode);

/**
 * wbc_detach_inode - disassociate wbc from inode and perform foreign detection
 * @wbc: writeback_control of the just finished writeback
 *
 * To be called after a writeback attempt of an inode finishes and undoes
 * wbc_attach_and_unlock_inode().  Can be called under any context.
 *
 * As concurrent write sharing of an inode is expected to be very rare and
 * memcg only tracks page ownership on first-use basis severely confining
 * the usefulness of such sharing, cgroup writeback tracks ownership
 * per-inode.  While the support for concurrent write sharing of an inode
 * is deemed unnecessary, an inode being written to by different cgroups at
 * different points in time is a lot more common, and, more importantly,
 * charging only by first-use can too readily lead to grossly incorrect
 * behaviors (single foreign page can lead to gigabytes of writeback to be
 * incorrectly attributed).
 *
 * To resolve this issue, cgroup writeback detects the majority dirtier of
 * an inode and transfers the ownership to it.
 * To avoid unnecessary oscillation, the detection mechanism keeps track
 * of history and gives out the switch verdict only if the foreign usage
 * pattern is stable over a certain amount of time and/or writeback
 * attempts.
 *
 * On each writeback attempt, @wbc tries to detect the majority writer
 * using Boyer-Moore majority vote algorithm.  In addition to the byte
 * count from the majority voting, it also counts the bytes written for the
 * current wb and the last round's winner wb (max of last round's current
 * wb, the winner from two rounds ago, and the last round's majority
 * candidate).  Keeping track of the historical winner helps the algorithm
 * to semi-reliably detect the most active writer even when it's not the
 * absolute majority.
 *
 * Once the winner of the round is determined, whether the winner is
 * foreign or not and how much IO time the round consumed is recorded in
 * inode->i_wb_frn_history.  If the amount of recorded foreign IO time is
 * over a certain threshold, the switch verdict is given.
 */
void wbc_detach_inode(struct writeback_control *wbc)
{
	struct bdi_writeback *wb = wbc->wb;
	struct inode *inode = wbc->inode;
	unsigned long avg_time, max_bytes, max_time;
	u16 history;
	int max_id;

	/* @wbc has no wb attached, so there is no reference to drop */
	if (!wb)
		return;

	history = inode->i_wb_frn_history;
	avg_time = inode->i_wb_frn_avg_time;

	/*
	 * pick the winner of this round; ties go to the current wb first,
	 * then to the last round's winner, then to the majority candidate
	 */
	if (wbc->wb_bytes >= wbc->wb_lcand_bytes &&
	    wbc->wb_bytes >= wbc->wb_tcand_bytes) {
		max_id = wbc->wb_id;
		max_bytes = wbc->wb_bytes;
	} else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) {
		max_id = wbc->wb_lcand_id;
		max_bytes = wbc->wb_lcand_bytes;
	} else {
		max_id = wbc->wb_tcand_id;
		max_bytes = wbc->wb_tcand_bytes;
	}

	/*
	 * Calculate the amount of IO time the winner consumed and fold it
	 * into the running average kept per inode.  If the consumed IO
	 * time is lower than avg / WB_FRN_TIME_CUT_DIV, ignore it for
	 * deciding whether to switch or not.  This is to prevent one-off
	 * small dirtiers from skewing the verdict.
	 *
	 * NOTE(review): the division below assumes wb->avg_write_bandwidth
	 * is non-zero -- confirm against the bandwidth estimation code.
	 */
	max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT,
				wb->avg_write_bandwidth);
	if (avg_time)
		avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) -
			    (avg_time >> WB_FRN_TIME_AVG_SHIFT);
	else
		avg_time = max_time;	/* immediate catch up on first run */

	if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) {
		int slots;

		/*
		 * The switch verdict is reached if foreign wb's consume
		 * more than a certain proportion of IO time in a
		 * WB_FRN_TIME_PERIOD.  This is loosely tracked by 16 slot
		 * history mask where each bit represents one sixteenth of
		 * the period.  Determine the number of slots to shift into
		 * history from @max_time.
		 */
		slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT),
			    (unsigned long)WB_FRN_HIST_MAX_SLOTS);
		history <<= slots;
		/* a foreign winner fills the newly shifted-in slots with 1s */
		if (wbc->wb_id != max_id)
			history |= (1U << slots) - 1;

		if (history)
			trace_inode_foreign_history(inode, wbc, history);

		/*
		 * Switch if the current wb isn't the consistent winner.
		 * If there are multiple closely competing dirtiers, the
		 * inode may switch across them repeatedly over time, which
		 * is okay.  The main goal is avoiding keeping an inode on
		 * the wrong wb for an extended period of time.
		 */
		if (hweight16(history) > WB_FRN_HIST_THR_SLOTS)
			inode_switch_wbs(inode, max_id);
	}

	/*
	 * Multiple instances of this function may race to update the
	 * following fields but we don't mind occasional inaccuracies.
	 */
	inode->i_wb_frn_winner = max_id;
	inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX);
	inode->i_wb_frn_history = history;

	/* drop the reference taken at attach time and mark @wbc detached */
	wb_put(wbc->wb);
	wbc->wb = NULL;
}
EXPORT_SYMBOL_GPL(wbc_detach_inode);

/**
 * wbc_account_cgroup_owner - account writeback to update inode cgroup ownership
 * @wbc: writeback_control of the writeback in progress
 * @page: page being written out
 * @bytes: number of bytes being written out
 *
 * @bytes from @page are about to be written out during the writeback
 * controlled by @wbc.  Keep the book for foreign inode detection.  See
 * wbc_detach_inode().
7298c2ecf20Sopenharmony_ci */ 7308c2ecf20Sopenharmony_civoid wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, 7318c2ecf20Sopenharmony_ci size_t bytes) 7328c2ecf20Sopenharmony_ci{ 7338c2ecf20Sopenharmony_ci struct cgroup_subsys_state *css; 7348c2ecf20Sopenharmony_ci int id; 7358c2ecf20Sopenharmony_ci 7368c2ecf20Sopenharmony_ci /* 7378c2ecf20Sopenharmony_ci * pageout() path doesn't attach @wbc to the inode being written 7388c2ecf20Sopenharmony_ci * out. This is intentional as we don't want the function to block 7398c2ecf20Sopenharmony_ci * behind a slow cgroup. Ultimately, we want pageout() to kick off 7408c2ecf20Sopenharmony_ci * regular writeback instead of writing things out itself. 7418c2ecf20Sopenharmony_ci */ 7428c2ecf20Sopenharmony_ci if (!wbc->wb || wbc->no_cgroup_owner) 7438c2ecf20Sopenharmony_ci return; 7448c2ecf20Sopenharmony_ci 7458c2ecf20Sopenharmony_ci css = mem_cgroup_css_from_page(page); 7468c2ecf20Sopenharmony_ci /* dead cgroups shouldn't contribute to inode ownership arbitration */ 7478c2ecf20Sopenharmony_ci if (!(css->flags & CSS_ONLINE)) 7488c2ecf20Sopenharmony_ci return; 7498c2ecf20Sopenharmony_ci 7508c2ecf20Sopenharmony_ci id = css->id; 7518c2ecf20Sopenharmony_ci 7528c2ecf20Sopenharmony_ci if (id == wbc->wb_id) { 7538c2ecf20Sopenharmony_ci wbc->wb_bytes += bytes; 7548c2ecf20Sopenharmony_ci return; 7558c2ecf20Sopenharmony_ci } 7568c2ecf20Sopenharmony_ci 7578c2ecf20Sopenharmony_ci if (id == wbc->wb_lcand_id) 7588c2ecf20Sopenharmony_ci wbc->wb_lcand_bytes += bytes; 7598c2ecf20Sopenharmony_ci 7608c2ecf20Sopenharmony_ci /* Boyer-Moore majority vote algorithm */ 7618c2ecf20Sopenharmony_ci if (!wbc->wb_tcand_bytes) 7628c2ecf20Sopenharmony_ci wbc->wb_tcand_id = id; 7638c2ecf20Sopenharmony_ci if (id == wbc->wb_tcand_id) 7648c2ecf20Sopenharmony_ci wbc->wb_tcand_bytes += bytes; 7658c2ecf20Sopenharmony_ci else 7668c2ecf20Sopenharmony_ci wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes); 7678c2ecf20Sopenharmony_ci} 
7688c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(wbc_account_cgroup_owner); 7698c2ecf20Sopenharmony_ci 7708c2ecf20Sopenharmony_ci/** 7718c2ecf20Sopenharmony_ci * inode_congested - test whether an inode is congested 7728c2ecf20Sopenharmony_ci * @inode: inode to test for congestion (may be NULL) 7738c2ecf20Sopenharmony_ci * @cong_bits: mask of WB_[a]sync_congested bits to test 7748c2ecf20Sopenharmony_ci * 7758c2ecf20Sopenharmony_ci * Tests whether @inode is congested. @cong_bits is the mask of congestion 7768c2ecf20Sopenharmony_ci * bits to test and the return value is the mask of set bits. 7778c2ecf20Sopenharmony_ci * 7788c2ecf20Sopenharmony_ci * If cgroup writeback is enabled for @inode, the congestion state is 7798c2ecf20Sopenharmony_ci * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg 7808c2ecf20Sopenharmony_ci * associated with @inode is congested; otherwise, the root wb's congestion 7818c2ecf20Sopenharmony_ci * state is used. 7828c2ecf20Sopenharmony_ci * 7838c2ecf20Sopenharmony_ci * @inode is allowed to be NULL as this function is often called on 7848c2ecf20Sopenharmony_ci * mapping->host which is NULL for the swapper space. 7858c2ecf20Sopenharmony_ci */ 7868c2ecf20Sopenharmony_ciint inode_congested(struct inode *inode, int cong_bits) 7878c2ecf20Sopenharmony_ci{ 7888c2ecf20Sopenharmony_ci /* 7898c2ecf20Sopenharmony_ci * Once set, ->i_wb never becomes NULL while the inode is alive. 7908c2ecf20Sopenharmony_ci * Start transaction iff ->i_wb is visible. 
7918c2ecf20Sopenharmony_ci */ 7928c2ecf20Sopenharmony_ci if (inode && inode_to_wb_is_valid(inode)) { 7938c2ecf20Sopenharmony_ci struct bdi_writeback *wb; 7948c2ecf20Sopenharmony_ci struct wb_lock_cookie lock_cookie = {}; 7958c2ecf20Sopenharmony_ci bool congested; 7968c2ecf20Sopenharmony_ci 7978c2ecf20Sopenharmony_ci wb = unlocked_inode_to_wb_begin(inode, &lock_cookie); 7988c2ecf20Sopenharmony_ci congested = wb_congested(wb, cong_bits); 7998c2ecf20Sopenharmony_ci unlocked_inode_to_wb_end(inode, &lock_cookie); 8008c2ecf20Sopenharmony_ci return congested; 8018c2ecf20Sopenharmony_ci } 8028c2ecf20Sopenharmony_ci 8038c2ecf20Sopenharmony_ci return wb_congested(&inode_to_bdi(inode)->wb, cong_bits); 8048c2ecf20Sopenharmony_ci} 8058c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(inode_congested); 8068c2ecf20Sopenharmony_ci 8078c2ecf20Sopenharmony_ci/** 8088c2ecf20Sopenharmony_ci * wb_split_bdi_pages - split nr_pages to write according to bandwidth 8098c2ecf20Sopenharmony_ci * @wb: target bdi_writeback to split @nr_pages to 8108c2ecf20Sopenharmony_ci * @nr_pages: number of pages to write for the whole bdi 8118c2ecf20Sopenharmony_ci * 8128c2ecf20Sopenharmony_ci * Split @wb's portion of @nr_pages according to @wb's write bandwidth in 8138c2ecf20Sopenharmony_ci * relation to the total write bandwidth of all wb's w/ dirty inodes on 8148c2ecf20Sopenharmony_ci * @wb->bdi. 
8158c2ecf20Sopenharmony_ci */ 8168c2ecf20Sopenharmony_cistatic long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages) 8178c2ecf20Sopenharmony_ci{ 8188c2ecf20Sopenharmony_ci unsigned long this_bw = wb->avg_write_bandwidth; 8198c2ecf20Sopenharmony_ci unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth); 8208c2ecf20Sopenharmony_ci 8218c2ecf20Sopenharmony_ci if (nr_pages == LONG_MAX) 8228c2ecf20Sopenharmony_ci return LONG_MAX; 8238c2ecf20Sopenharmony_ci 8248c2ecf20Sopenharmony_ci /* 8258c2ecf20Sopenharmony_ci * This may be called on clean wb's and proportional distribution 8268c2ecf20Sopenharmony_ci * may not make sense, just use the original @nr_pages in those 8278c2ecf20Sopenharmony_ci * cases. In general, we wanna err on the side of writing more. 8288c2ecf20Sopenharmony_ci */ 8298c2ecf20Sopenharmony_ci if (!tot_bw || this_bw >= tot_bw) 8308c2ecf20Sopenharmony_ci return nr_pages; 8318c2ecf20Sopenharmony_ci else 8328c2ecf20Sopenharmony_ci return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw); 8338c2ecf20Sopenharmony_ci} 8348c2ecf20Sopenharmony_ci 8358c2ecf20Sopenharmony_ci/** 8368c2ecf20Sopenharmony_ci * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi 8378c2ecf20Sopenharmony_ci * @bdi: target backing_dev_info 8388c2ecf20Sopenharmony_ci * @base_work: wb_writeback_work to issue 8398c2ecf20Sopenharmony_ci * @skip_if_busy: skip wb's which already have writeback in progress 8408c2ecf20Sopenharmony_ci * 8418c2ecf20Sopenharmony_ci * Split and issue @base_work to all wb's (bdi_writeback's) of @bdi which 8428c2ecf20Sopenharmony_ci * have dirty inodes. If @base_work->nr_page isn't %LONG_MAX, it's 8438c2ecf20Sopenharmony_ci * distributed to the busy wbs according to each wb's proportion in the 8448c2ecf20Sopenharmony_ci * total active write bandwidth of @bdi. 
 */
static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
				  struct wb_writeback_work *base_work,
				  bool skip_if_busy)
{
	struct bdi_writeback *last_wb = NULL;
	/*
	 * Seed the cursor with the list head itself so that the "continue"
	 * iterator below starts from the first wb on @bdi->wb_list.
	 */
	struct bdi_writeback *wb = list_entry(&bdi->wb_list,
					      struct bdi_writeback, bdi_node);

	might_sleep();
restart:
	rcu_read_lock();
	list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
		DEFINE_WB_COMPLETION(fallback_work_done, bdi);
		struct wb_writeback_work fallback_work;
		struct wb_writeback_work *work;
		long nr_pages;

		/*
		 * The wb pinned by the previous fallback pass was only needed
		 * to resume iteration; release it now that we moved past it.
		 */
		if (last_wb) {
			wb_put(last_wb);
			last_wb = NULL;
		}

		/* SYNC_ALL writes out I_DIRTY_TIME too */
		if (!wb_has_dirty_io(wb) &&
		    (base_work->sync_mode == WB_SYNC_NONE ||
		     list_empty(&wb->b_dirty_time)))
			continue;
		if (skip_if_busy && writeback_in_progress(wb))
			continue;

		nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages);

		/* atomic allocation: we are inside the RCU read-side section */
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
		if (work) {
			/* queue a self-freeing private copy of @base_work */
			*work = *base_work;
			work->nr_pages = nr_pages;
			work->auto_free = 1;
			wb_queue_work(wb, work);
			continue;
		}

		/*
		 * If wb_tryget fails, the wb has been shutdown, skip it.
		 *
		 * Pin @wb so that it stays on @bdi->wb_list.  This allows
		 * continuing iteration from @wb after dropping and
		 * regrabbing rcu read lock.
		 */
		if (!wb_tryget(wb))
			continue;

		/* alloc failed, execute synchronously using on-stack fallback */
		work = &fallback_work;
		*work = *base_work;
		work->nr_pages = nr_pages;
		work->auto_free = 0;
		work->done = &fallback_work_done;

		wb_queue_work(wb, work);
		last_wb = wb;

		/* wait outside the RCU section, then resume after @wb */
		rcu_read_unlock();
		wb_wait_for_completion(&fallback_work_done);
		goto restart;
	}
	rcu_read_unlock();

	/* the loop may end right after a fallback pass; drop that pin too */
	if (last_wb)
		wb_put(last_wb);
}

/**
 * cgroup_writeback_by_id - initiate cgroup writeback from bdi and memcg IDs
 * @bdi_id: target bdi id
 * @memcg_id: target memcg css id
 * @nr: number of pages to write, 0 for best-effort dirty flushing
 * @reason: reason why some writeback work initiated
 * @done: target wb_completion
 *
 * Initiate flush of the bdi_writeback identified by @bdi_id and @memcg_id
 * with the specified parameters.
 *
 * Return: 0 if the work was queued, -ENOENT if the bdi, the memcg or the
 * matching wb could not be found, -ENOMEM if the work item could not be
 * allocated.
 */
int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr,
			   enum wb_reason reason, struct wb_completion *done)
{
	struct backing_dev_info *bdi;
	struct cgroup_subsys_state *memcg_css;
	struct bdi_writeback *wb;
	struct wb_writeback_work *work;
	int ret;

	/* lookup bdi and memcg */
	bdi = bdi_get_by_id(bdi_id);
	if (!bdi)
		return -ENOENT;

	/* the id lookup is done under RCU; keep the css only if we can pin it */
	rcu_read_lock();
	memcg_css = css_from_id(memcg_id, &memory_cgrp_subsys);
	if (memcg_css && !css_tryget(memcg_css))
		memcg_css = NULL;
	rcu_read_unlock();
	if (!memcg_css) {
		ret = -ENOENT;
		goto out_bdi_put;
	}

	/*
	 * And find the associated wb.  If the wb isn't there already
	 * there's nothing to flush, don't create one.
	 */
	wb = wb_get_lookup(bdi, memcg_css);
	if (!wb) {
		ret = -ENOENT;
		goto out_css_put;
	}

	/*
	 * If @nr is zero, the caller is attempting to write out most of
	 * the currently dirty pages.  Let's take the current dirty page
	 * count and inflate it by 25% which should be large enough to
	 * flush out most dirty pages while avoiding getting livelocked by
	 * concurrent dirtiers.
	 */
	if (!nr) {
		unsigned long filepages, headroom, dirty, writeback;

		/* only @dirty is used; the other outputs are discarded */
		mem_cgroup_wb_stats(wb, &filepages, &headroom, &dirty,
				      &writeback);
		nr = dirty * 10 / 8;
	}

	/* issue the writeback work */
	work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOWARN);
	if (work) {
		work->nr_pages = nr;
		work->sync_mode = WB_SYNC_NONE;
		work->range_cyclic = 1;
		work->reason = reason;
		work->done = done;
		work->auto_free = 1;
		wb_queue_work(wb, work);
		ret = 0;
	} else {
		ret = -ENOMEM;
	}

	/* release the references in reverse order of acquisition */
	wb_put(wb);
out_css_put:
	css_put(memcg_css);
out_bdi_put:
	bdi_put(bdi);
	return ret;
}

/**
 * cgroup_writeback_umount - flush inode wb switches for umount
 *
 * This function is called when a super_block is about to be destroyed and
 * flushes in-flight inode wb switches.  An inode wb switch goes through
 * RCU and then workqueue, so the two need to be flushed in order to ensure
 * that all previously scheduled switches are finished.  As wb switches are
 * rare occurrences and rcu_barrier() can take a while, perform
 * flushing iff wb switches are in flight.
 */
void cgroup_writeback_umount(void)
{
	if (atomic_read(&isw_nr_in_flight)) {
		/*
		 * Use rcu_barrier() to wait for all pending callbacks to
		 * ensure that all in-flight wb switches are in the workqueue.
		 */
		rcu_barrier();
		flush_workqueue(isw_wq);
	}
}

/*
 * Create the workqueue stored in isw_wq, which cgroup_writeback_umount()
 * flushes.  Returns 0 on success or -ENOMEM if the workqueue cannot be
 * allocated.
 */
static int __init cgroup_writeback_init(void)
{
	isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0);
	if (!isw_wq)
		return -ENOMEM;
	return 0;
}
fs_initcall(cgroup_writeback_init);

#else	/* CONFIG_CGROUP_WRITEBACK */

/* without cgroup writeback these two helpers are empty */
static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }

/*
 * Called with @inode->i_lock held.  Looks up the inode's wb, releases
 * i_lock and returns the wb with its list_lock held.
 */
static struct bdi_writeback *
locked_inode_to_wb_and_lock_list(struct inode *inode)
	__releases(&inode->i_lock)
	__acquires(&wb->list_lock)
{
	struct bdi_writeback *wb = inode_to_wb(inode);

	spin_unlock(&inode->i_lock);
	spin_lock(&wb->list_lock);
	return wb;
}

/* Looks up the inode's wb and returns it with its list_lock held. */
static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
	__acquires(&wb->list_lock)
{
	struct bdi_writeback *wb = inode_to_wb(inode);

	spin_lock(&wb->list_lock);
	return wb;
}

/* !CONFIG_CGROUP_WRITEBACK: nothing to split, return @nr_pages unchanged */
static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
{
	return nr_pages;
}

/*
 * !CONFIG_CGROUP_WRITEBACK: queue @base_work itself on the wb embedded in
 * @bdi, unless @skip_if_busy is set and writeback is already in progress
 * there.  auto_free is cleared on @base_work before it is queued.
 */
static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
				  struct wb_writeback_work *base_work,
				  bool skip_if_busy)
{
	might_sleep();

	if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) {
		base_work->auto_free = 0;
		wb_queue_work(&bdi->wb, base_work);
	}
}

#endif	/* CONFIG_CGROUP_WRITEBACK */

/*
 * Add in the number of potentially dirty inodes, because each inode
 * write can dirty pagecache in the underlying blockdev.
 */
static unsigned long get_nr_dirty_pages(void)
{
	return global_node_page_state(NR_FILE_DIRTY) +
		get_nr_dirty_inodes();
}

/*
 * Request writeback on @wb for @reason.  Does nothing when @wb has no dirty
 * IO or when WB_start_all is already set in @wb->state.
 */
static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason)
{
	if (!wb_has_dirty_io(wb))
		return;

	/*
	 * All callers of this function want to start writeback of all
	 * dirty pages.  Places like vmscan can call this at a very
	 * high frequency, causing pointless allocations of tons of
	 * work items and keeping the flusher threads busy retrieving
	 * that work.  Ensure that we only allow one of them pending and
	 * inflight at the time.
	 *
	 * The plain test_bit() comes first so that the common "already
	 * set" case returns without attempting the atomic test-and-set.
	 */
	if (test_bit(WB_start_all, &wb->state) ||
	    test_and_set_bit(WB_start_all, &wb->state))
		return;

	wb->start_all_reason = reason;
	wb_wakeup(wb);
}

/**
 * wb_start_background_writeback - start background writeback
 * @wb: bdi_writeback to write from
 *
 * Description:
 *   This makes sure WB_SYNC_NONE background writeback happens.  When
 *   this function returns, it is only guaranteed that for given wb
 *   some IO is happening if we are over background dirty threshold.
 *   Caller need not hold sb s_umount semaphore.
 */
void wb_start_background_writeback(struct bdi_writeback *wb)
{
	/*
	 * We just wake up the flusher thread.  It will perform background
	 * writeback as soon as there is no other work to do.
	 */
	trace_writeback_wake_background(wb);
	wb_wakeup(wb);
}

/*
 * Remove the inode from the writeback list it is on.
 *
 * Takes the wb's list_lock first and then @inode->i_lock, and releases
 * them in the reverse order.
 */
void inode_io_list_del(struct inode *inode)
{
	struct bdi_writeback *wb;

	wb = inode_to_wb_and_lock_list(inode);
	spin_lock(&inode->i_lock);
	inode_io_list_del_locked(inode, wb);
	spin_unlock(&inode->i_lock);
	spin_unlock(&wb->list_lock);
}
EXPORT_SYMBOL(inode_io_list_del);

/*
 * mark an inode as under writeback on the sb
 *
 * The inode is appended to sb->s_inodes_wb unless it is already on a list.
 * An unlocked list_empty() check is made first; it is repeated under
 * s_inode_wblist_lock before the list is modified.
 */
void sb_mark_inode_writeback(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	unsigned long flags;

	if (list_empty(&inode->i_wb_list)) {
		spin_lock_irqsave(&sb->s_inode_wblist_lock, flags);
		if (list_empty(&inode->i_wb_list)) {
			list_add_tail(&inode->i_wb_list, &sb->s_inodes_wb);
			trace_sb_mark_inode_writeback(inode);
		}
		spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags);
	}
}

/*
 * clear an inode as under writeback on the sb
11628c2ecf20Sopenharmony_ci */ 11638c2ecf20Sopenharmony_civoid sb_clear_inode_writeback(struct inode *inode) 11648c2ecf20Sopenharmony_ci{ 11658c2ecf20Sopenharmony_ci struct super_block *sb = inode->i_sb; 11668c2ecf20Sopenharmony_ci unsigned long flags; 11678c2ecf20Sopenharmony_ci 11688c2ecf20Sopenharmony_ci if (!list_empty(&inode->i_wb_list)) { 11698c2ecf20Sopenharmony_ci spin_lock_irqsave(&sb->s_inode_wblist_lock, flags); 11708c2ecf20Sopenharmony_ci if (!list_empty(&inode->i_wb_list)) { 11718c2ecf20Sopenharmony_ci list_del_init(&inode->i_wb_list); 11728c2ecf20Sopenharmony_ci trace_sb_clear_inode_writeback(inode); 11738c2ecf20Sopenharmony_ci } 11748c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags); 11758c2ecf20Sopenharmony_ci } 11768c2ecf20Sopenharmony_ci} 11778c2ecf20Sopenharmony_ci 11788c2ecf20Sopenharmony_ci/* 11798c2ecf20Sopenharmony_ci * Redirty an inode: set its when-it-was dirtied timestamp and move it to the 11808c2ecf20Sopenharmony_ci * furthest end of its superblock's dirty-inode list. 11818c2ecf20Sopenharmony_ci * 11828c2ecf20Sopenharmony_ci * Before stamping the inode's ->dirtied_when, we check to see whether it is 11838c2ecf20Sopenharmony_ci * already the most-recently-dirtied inode on the b_dirty list. If that is 11848c2ecf20Sopenharmony_ci * the case then the inode must have been redirtied while it was being written 11858c2ecf20Sopenharmony_ci * out and we don't reset its dirtied_when. 
11868c2ecf20Sopenharmony_ci */ 11878c2ecf20Sopenharmony_cistatic void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb) 11888c2ecf20Sopenharmony_ci{ 11898c2ecf20Sopenharmony_ci assert_spin_locked(&inode->i_lock); 11908c2ecf20Sopenharmony_ci 11918c2ecf20Sopenharmony_ci if (!list_empty(&wb->b_dirty)) { 11928c2ecf20Sopenharmony_ci struct inode *tail; 11938c2ecf20Sopenharmony_ci 11948c2ecf20Sopenharmony_ci tail = wb_inode(wb->b_dirty.next); 11958c2ecf20Sopenharmony_ci if (time_before(inode->dirtied_when, tail->dirtied_when)) 11968c2ecf20Sopenharmony_ci inode->dirtied_when = jiffies; 11978c2ecf20Sopenharmony_ci } 11988c2ecf20Sopenharmony_ci inode_io_list_move_locked(inode, wb, &wb->b_dirty); 11998c2ecf20Sopenharmony_ci inode->i_state &= ~I_SYNC_QUEUED; 12008c2ecf20Sopenharmony_ci} 12018c2ecf20Sopenharmony_ci 12028c2ecf20Sopenharmony_cistatic void redirty_tail(struct inode *inode, struct bdi_writeback *wb) 12038c2ecf20Sopenharmony_ci{ 12048c2ecf20Sopenharmony_ci spin_lock(&inode->i_lock); 12058c2ecf20Sopenharmony_ci redirty_tail_locked(inode, wb); 12068c2ecf20Sopenharmony_ci spin_unlock(&inode->i_lock); 12078c2ecf20Sopenharmony_ci} 12088c2ecf20Sopenharmony_ci 12098c2ecf20Sopenharmony_ci/* 12108c2ecf20Sopenharmony_ci * requeue inode for re-scanning after bdi->b_io list is exhausted. 12118c2ecf20Sopenharmony_ci */ 12128c2ecf20Sopenharmony_cistatic void requeue_io(struct inode *inode, struct bdi_writeback *wb) 12138c2ecf20Sopenharmony_ci{ 12148c2ecf20Sopenharmony_ci inode_io_list_move_locked(inode, wb, &wb->b_more_io); 12158c2ecf20Sopenharmony_ci} 12168c2ecf20Sopenharmony_ci 12178c2ecf20Sopenharmony_cistatic void inode_sync_complete(struct inode *inode) 12188c2ecf20Sopenharmony_ci{ 12198c2ecf20Sopenharmony_ci inode->i_state &= ~I_SYNC; 12208c2ecf20Sopenharmony_ci /* If inode is clean an unused, put it into LRU now... 
*/ 12218c2ecf20Sopenharmony_ci inode_add_lru(inode); 12228c2ecf20Sopenharmony_ci /* Waiters must see I_SYNC cleared before being woken up */ 12238c2ecf20Sopenharmony_ci smp_mb(); 12248c2ecf20Sopenharmony_ci wake_up_bit(&inode->i_state, __I_SYNC); 12258c2ecf20Sopenharmony_ci} 12268c2ecf20Sopenharmony_ci 12278c2ecf20Sopenharmony_cistatic bool inode_dirtied_after(struct inode *inode, unsigned long t) 12288c2ecf20Sopenharmony_ci{ 12298c2ecf20Sopenharmony_ci bool ret = time_after(inode->dirtied_when, t); 12308c2ecf20Sopenharmony_ci#ifndef CONFIG_64BIT 12318c2ecf20Sopenharmony_ci /* 12328c2ecf20Sopenharmony_ci * For inodes being constantly redirtied, dirtied_when can get stuck. 12338c2ecf20Sopenharmony_ci * It _appears_ to be in the future, but is actually in distant past. 12348c2ecf20Sopenharmony_ci * This test is necessary to prevent such wrapped-around relative times 12358c2ecf20Sopenharmony_ci * from permanently stopping the whole bdi writeback. 12368c2ecf20Sopenharmony_ci */ 12378c2ecf20Sopenharmony_ci ret = ret && time_before_eq(inode->dirtied_when, jiffies); 12388c2ecf20Sopenharmony_ci#endif 12398c2ecf20Sopenharmony_ci return ret; 12408c2ecf20Sopenharmony_ci} 12418c2ecf20Sopenharmony_ci 12428c2ecf20Sopenharmony_ci#define EXPIRE_DIRTY_ATIME 0x0001 12438c2ecf20Sopenharmony_ci 12448c2ecf20Sopenharmony_ci/* 12458c2ecf20Sopenharmony_ci * Move expired (dirtied before dirtied_before) dirty inodes from 12468c2ecf20Sopenharmony_ci * @delaying_queue to @dispatch_queue. 
12478c2ecf20Sopenharmony_ci */ 12488c2ecf20Sopenharmony_cistatic int move_expired_inodes(struct list_head *delaying_queue, 12498c2ecf20Sopenharmony_ci struct list_head *dispatch_queue, 12508c2ecf20Sopenharmony_ci unsigned long dirtied_before) 12518c2ecf20Sopenharmony_ci{ 12528c2ecf20Sopenharmony_ci LIST_HEAD(tmp); 12538c2ecf20Sopenharmony_ci struct list_head *pos, *node; 12548c2ecf20Sopenharmony_ci struct super_block *sb = NULL; 12558c2ecf20Sopenharmony_ci struct inode *inode; 12568c2ecf20Sopenharmony_ci int do_sb_sort = 0; 12578c2ecf20Sopenharmony_ci int moved = 0; 12588c2ecf20Sopenharmony_ci 12598c2ecf20Sopenharmony_ci while (!list_empty(delaying_queue)) { 12608c2ecf20Sopenharmony_ci inode = wb_inode(delaying_queue->prev); 12618c2ecf20Sopenharmony_ci if (inode_dirtied_after(inode, dirtied_before)) 12628c2ecf20Sopenharmony_ci break; 12638c2ecf20Sopenharmony_ci list_move(&inode->i_io_list, &tmp); 12648c2ecf20Sopenharmony_ci moved++; 12658c2ecf20Sopenharmony_ci spin_lock(&inode->i_lock); 12668c2ecf20Sopenharmony_ci inode->i_state |= I_SYNC_QUEUED; 12678c2ecf20Sopenharmony_ci spin_unlock(&inode->i_lock); 12688c2ecf20Sopenharmony_ci if (sb_is_blkdev_sb(inode->i_sb)) 12698c2ecf20Sopenharmony_ci continue; 12708c2ecf20Sopenharmony_ci if (sb && sb != inode->i_sb) 12718c2ecf20Sopenharmony_ci do_sb_sort = 1; 12728c2ecf20Sopenharmony_ci sb = inode->i_sb; 12738c2ecf20Sopenharmony_ci } 12748c2ecf20Sopenharmony_ci 12758c2ecf20Sopenharmony_ci /* just one sb in list, splice to dispatch_queue and we're done */ 12768c2ecf20Sopenharmony_ci if (!do_sb_sort) { 12778c2ecf20Sopenharmony_ci list_splice(&tmp, dispatch_queue); 12788c2ecf20Sopenharmony_ci goto out; 12798c2ecf20Sopenharmony_ci } 12808c2ecf20Sopenharmony_ci 12818c2ecf20Sopenharmony_ci /* Move inodes from one superblock together */ 12828c2ecf20Sopenharmony_ci while (!list_empty(&tmp)) { 12838c2ecf20Sopenharmony_ci sb = wb_inode(tmp.prev)->i_sb; 12848c2ecf20Sopenharmony_ci list_for_each_prev_safe(pos, node, &tmp) { 
12858c2ecf20Sopenharmony_ci inode = wb_inode(pos); 12868c2ecf20Sopenharmony_ci if (inode->i_sb == sb) 12878c2ecf20Sopenharmony_ci list_move(&inode->i_io_list, dispatch_queue); 12888c2ecf20Sopenharmony_ci } 12898c2ecf20Sopenharmony_ci } 12908c2ecf20Sopenharmony_ciout: 12918c2ecf20Sopenharmony_ci return moved; 12928c2ecf20Sopenharmony_ci} 12938c2ecf20Sopenharmony_ci 12948c2ecf20Sopenharmony_ci/* 12958c2ecf20Sopenharmony_ci * Queue all expired dirty inodes for io, eldest first. 12968c2ecf20Sopenharmony_ci * Before 12978c2ecf20Sopenharmony_ci * newly dirtied b_dirty b_io b_more_io 12988c2ecf20Sopenharmony_ci * =============> gf edc BA 12998c2ecf20Sopenharmony_ci * After 13008c2ecf20Sopenharmony_ci * newly dirtied b_dirty b_io b_more_io 13018c2ecf20Sopenharmony_ci * =============> g fBAedc 13028c2ecf20Sopenharmony_ci * | 13038c2ecf20Sopenharmony_ci * +--> dequeue for IO 13048c2ecf20Sopenharmony_ci */ 13058c2ecf20Sopenharmony_cistatic void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work, 13068c2ecf20Sopenharmony_ci unsigned long dirtied_before) 13078c2ecf20Sopenharmony_ci{ 13088c2ecf20Sopenharmony_ci int moved; 13098c2ecf20Sopenharmony_ci unsigned long time_expire_jif = dirtied_before; 13108c2ecf20Sopenharmony_ci 13118c2ecf20Sopenharmony_ci assert_spin_locked(&wb->list_lock); 13128c2ecf20Sopenharmony_ci list_splice_init(&wb->b_more_io, &wb->b_io); 13138c2ecf20Sopenharmony_ci moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, dirtied_before); 13148c2ecf20Sopenharmony_ci if (!work->for_sync) 13158c2ecf20Sopenharmony_ci time_expire_jif = jiffies - dirtytime_expire_interval * HZ; 13168c2ecf20Sopenharmony_ci moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io, 13178c2ecf20Sopenharmony_ci time_expire_jif); 13188c2ecf20Sopenharmony_ci if (moved) 13198c2ecf20Sopenharmony_ci wb_io_lists_populated(wb); 13208c2ecf20Sopenharmony_ci trace_writeback_queue_io(wb, work, dirtied_before, moved); 13218c2ecf20Sopenharmony_ci} 13228c2ecf20Sopenharmony_ci 
13238c2ecf20Sopenharmony_cistatic int write_inode(struct inode *inode, struct writeback_control *wbc) 13248c2ecf20Sopenharmony_ci{ 13258c2ecf20Sopenharmony_ci int ret; 13268c2ecf20Sopenharmony_ci 13278c2ecf20Sopenharmony_ci if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) { 13288c2ecf20Sopenharmony_ci trace_writeback_write_inode_start(inode, wbc); 13298c2ecf20Sopenharmony_ci ret = inode->i_sb->s_op->write_inode(inode, wbc); 13308c2ecf20Sopenharmony_ci trace_writeback_write_inode(inode, wbc); 13318c2ecf20Sopenharmony_ci return ret; 13328c2ecf20Sopenharmony_ci } 13338c2ecf20Sopenharmony_ci return 0; 13348c2ecf20Sopenharmony_ci} 13358c2ecf20Sopenharmony_ci 13368c2ecf20Sopenharmony_ci/* 13378c2ecf20Sopenharmony_ci * Wait for writeback on an inode to complete. Called with i_lock held. 13388c2ecf20Sopenharmony_ci * Caller must make sure inode cannot go away when we drop i_lock. 13398c2ecf20Sopenharmony_ci */ 13408c2ecf20Sopenharmony_cistatic void __inode_wait_for_writeback(struct inode *inode) 13418c2ecf20Sopenharmony_ci __releases(inode->i_lock) 13428c2ecf20Sopenharmony_ci __acquires(inode->i_lock) 13438c2ecf20Sopenharmony_ci{ 13448c2ecf20Sopenharmony_ci DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC); 13458c2ecf20Sopenharmony_ci wait_queue_head_t *wqh; 13468c2ecf20Sopenharmony_ci 13478c2ecf20Sopenharmony_ci wqh = bit_waitqueue(&inode->i_state, __I_SYNC); 13488c2ecf20Sopenharmony_ci while (inode->i_state & I_SYNC) { 13498c2ecf20Sopenharmony_ci spin_unlock(&inode->i_lock); 13508c2ecf20Sopenharmony_ci __wait_on_bit(wqh, &wq, bit_wait, 13518c2ecf20Sopenharmony_ci TASK_UNINTERRUPTIBLE); 13528c2ecf20Sopenharmony_ci spin_lock(&inode->i_lock); 13538c2ecf20Sopenharmony_ci } 13548c2ecf20Sopenharmony_ci} 13558c2ecf20Sopenharmony_ci 13568c2ecf20Sopenharmony_ci/* 13578c2ecf20Sopenharmony_ci * Wait for writeback on an inode to complete. Caller must have inode pinned. 
13588c2ecf20Sopenharmony_ci */ 13598c2ecf20Sopenharmony_civoid inode_wait_for_writeback(struct inode *inode) 13608c2ecf20Sopenharmony_ci{ 13618c2ecf20Sopenharmony_ci spin_lock(&inode->i_lock); 13628c2ecf20Sopenharmony_ci __inode_wait_for_writeback(inode); 13638c2ecf20Sopenharmony_ci spin_unlock(&inode->i_lock); 13648c2ecf20Sopenharmony_ci} 13658c2ecf20Sopenharmony_ci 13668c2ecf20Sopenharmony_ci/* 13678c2ecf20Sopenharmony_ci * Sleep until I_SYNC is cleared. This function must be called with i_lock 13688c2ecf20Sopenharmony_ci * held and drops it. It is aimed for callers not holding any inode reference 13698c2ecf20Sopenharmony_ci * so once i_lock is dropped, inode can go away. 13708c2ecf20Sopenharmony_ci */ 13718c2ecf20Sopenharmony_cistatic void inode_sleep_on_writeback(struct inode *inode) 13728c2ecf20Sopenharmony_ci __releases(inode->i_lock) 13738c2ecf20Sopenharmony_ci{ 13748c2ecf20Sopenharmony_ci DEFINE_WAIT(wait); 13758c2ecf20Sopenharmony_ci wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC); 13768c2ecf20Sopenharmony_ci int sleep; 13778c2ecf20Sopenharmony_ci 13788c2ecf20Sopenharmony_ci prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); 13798c2ecf20Sopenharmony_ci sleep = inode->i_state & I_SYNC; 13808c2ecf20Sopenharmony_ci spin_unlock(&inode->i_lock); 13818c2ecf20Sopenharmony_ci if (sleep) 13828c2ecf20Sopenharmony_ci schedule(); 13838c2ecf20Sopenharmony_ci finish_wait(wqh, &wait); 13848c2ecf20Sopenharmony_ci} 13858c2ecf20Sopenharmony_ci 13868c2ecf20Sopenharmony_ci/* 13878c2ecf20Sopenharmony_ci * Find proper writeback list for the inode depending on its current state and 13888c2ecf20Sopenharmony_ci * possibly also change of its state while we were doing writeback. Here we 13898c2ecf20Sopenharmony_ci * handle things such as livelock prevention or fairness of writeback among 13908c2ecf20Sopenharmony_ci * inodes. 
This function can be called only by flusher thread - noone else 13918c2ecf20Sopenharmony_ci * processes all inodes in writeback lists and requeueing inodes behind flusher 13928c2ecf20Sopenharmony_ci * thread's back can have unexpected consequences. 13938c2ecf20Sopenharmony_ci */ 13948c2ecf20Sopenharmony_cistatic void requeue_inode(struct inode *inode, struct bdi_writeback *wb, 13958c2ecf20Sopenharmony_ci struct writeback_control *wbc) 13968c2ecf20Sopenharmony_ci{ 13978c2ecf20Sopenharmony_ci if (inode->i_state & I_FREEING) 13988c2ecf20Sopenharmony_ci return; 13998c2ecf20Sopenharmony_ci 14008c2ecf20Sopenharmony_ci /* 14018c2ecf20Sopenharmony_ci * Sync livelock prevention. Each inode is tagged and synced in one 14028c2ecf20Sopenharmony_ci * shot. If still dirty, it will be redirty_tail()'ed below. Update 14038c2ecf20Sopenharmony_ci * the dirty time to prevent enqueue and sync it again. 14048c2ecf20Sopenharmony_ci */ 14058c2ecf20Sopenharmony_ci if ((inode->i_state & I_DIRTY) && 14068c2ecf20Sopenharmony_ci (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)) 14078c2ecf20Sopenharmony_ci inode->dirtied_when = jiffies; 14088c2ecf20Sopenharmony_ci 14098c2ecf20Sopenharmony_ci if (wbc->pages_skipped) { 14108c2ecf20Sopenharmony_ci /* 14118c2ecf20Sopenharmony_ci * writeback is not making progress due to locked 14128c2ecf20Sopenharmony_ci * buffers. Skip this inode for now. 14138c2ecf20Sopenharmony_ci */ 14148c2ecf20Sopenharmony_ci redirty_tail_locked(inode, wb); 14158c2ecf20Sopenharmony_ci return; 14168c2ecf20Sopenharmony_ci } 14178c2ecf20Sopenharmony_ci 14188c2ecf20Sopenharmony_ci if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) { 14198c2ecf20Sopenharmony_ci /* 14208c2ecf20Sopenharmony_ci * We didn't write back all the pages. nfs_writepages() 14218c2ecf20Sopenharmony_ci * sometimes bales out without doing anything. 14228c2ecf20Sopenharmony_ci */ 14238c2ecf20Sopenharmony_ci if (wbc->nr_to_write <= 0) { 14248c2ecf20Sopenharmony_ci /* Slice used up. 
Queue for next turn. */ 14258c2ecf20Sopenharmony_ci requeue_io(inode, wb); 14268c2ecf20Sopenharmony_ci } else { 14278c2ecf20Sopenharmony_ci /* 14288c2ecf20Sopenharmony_ci * Writeback blocked by something other than 14298c2ecf20Sopenharmony_ci * congestion. Delay the inode for some time to 14308c2ecf20Sopenharmony_ci * avoid spinning on the CPU (100% iowait) 14318c2ecf20Sopenharmony_ci * retrying writeback of the dirty page/inode 14328c2ecf20Sopenharmony_ci * that cannot be performed immediately. 14338c2ecf20Sopenharmony_ci */ 14348c2ecf20Sopenharmony_ci redirty_tail_locked(inode, wb); 14358c2ecf20Sopenharmony_ci } 14368c2ecf20Sopenharmony_ci } else if (inode->i_state & I_DIRTY) { 14378c2ecf20Sopenharmony_ci /* 14388c2ecf20Sopenharmony_ci * Filesystems can dirty the inode during writeback operations, 14398c2ecf20Sopenharmony_ci * such as delayed allocation during submission or metadata 14408c2ecf20Sopenharmony_ci * updates after data IO completion. 14418c2ecf20Sopenharmony_ci */ 14428c2ecf20Sopenharmony_ci redirty_tail_locked(inode, wb); 14438c2ecf20Sopenharmony_ci } else if (inode->i_state & I_DIRTY_TIME) { 14448c2ecf20Sopenharmony_ci inode->dirtied_when = jiffies; 14458c2ecf20Sopenharmony_ci inode_io_list_move_locked(inode, wb, &wb->b_dirty_time); 14468c2ecf20Sopenharmony_ci inode->i_state &= ~I_SYNC_QUEUED; 14478c2ecf20Sopenharmony_ci } else { 14488c2ecf20Sopenharmony_ci /* The inode is clean. Remove from writeback lists. */ 14498c2ecf20Sopenharmony_ci inode_io_list_del_locked(inode, wb); 14508c2ecf20Sopenharmony_ci } 14518c2ecf20Sopenharmony_ci} 14528c2ecf20Sopenharmony_ci 14538c2ecf20Sopenharmony_ci/* 14548c2ecf20Sopenharmony_ci * Write out an inode and its dirty pages. Do not update the writeback list 14558c2ecf20Sopenharmony_ci * linkage. That is left to the caller. The caller is also responsible for 14568c2ecf20Sopenharmony_ci * setting I_SYNC flag and calling inode_sync_complete() to clear it. 
14578c2ecf20Sopenharmony_ci */ 14588c2ecf20Sopenharmony_cistatic int 14598c2ecf20Sopenharmony_ci__writeback_single_inode(struct inode *inode, struct writeback_control *wbc) 14608c2ecf20Sopenharmony_ci{ 14618c2ecf20Sopenharmony_ci struct address_space *mapping = inode->i_mapping; 14628c2ecf20Sopenharmony_ci long nr_to_write = wbc->nr_to_write; 14638c2ecf20Sopenharmony_ci unsigned dirty; 14648c2ecf20Sopenharmony_ci int ret; 14658c2ecf20Sopenharmony_ci 14668c2ecf20Sopenharmony_ci WARN_ON(!(inode->i_state & I_SYNC)); 14678c2ecf20Sopenharmony_ci 14688c2ecf20Sopenharmony_ci trace_writeback_single_inode_start(inode, wbc, nr_to_write); 14698c2ecf20Sopenharmony_ci 14708c2ecf20Sopenharmony_ci ret = do_writepages(mapping, wbc); 14718c2ecf20Sopenharmony_ci 14728c2ecf20Sopenharmony_ci /* 14738c2ecf20Sopenharmony_ci * Make sure to wait on the data before writing out the metadata. 14748c2ecf20Sopenharmony_ci * This is important for filesystems that modify metadata on data 14758c2ecf20Sopenharmony_ci * I/O completion. We don't do it for sync(2) writeback because it has a 14768c2ecf20Sopenharmony_ci * separate, external IO completion path and ->sync_fs for guaranteeing 14778c2ecf20Sopenharmony_ci * inode metadata is written back correctly. 14788c2ecf20Sopenharmony_ci */ 14798c2ecf20Sopenharmony_ci if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) { 14808c2ecf20Sopenharmony_ci int err = filemap_fdatawait(mapping); 14818c2ecf20Sopenharmony_ci if (ret == 0) 14828c2ecf20Sopenharmony_ci ret = err; 14838c2ecf20Sopenharmony_ci } 14848c2ecf20Sopenharmony_ci 14858c2ecf20Sopenharmony_ci /* 14868c2ecf20Sopenharmony_ci * If the inode has dirty timestamps and we need to write them, call 14878c2ecf20Sopenharmony_ci * mark_inode_dirty_sync() to notify the filesystem about it and to 14888c2ecf20Sopenharmony_ci * change I_DIRTY_TIME into I_DIRTY_SYNC. 
14898c2ecf20Sopenharmony_ci */ 14908c2ecf20Sopenharmony_ci if ((inode->i_state & I_DIRTY_TIME) && 14918c2ecf20Sopenharmony_ci (wbc->sync_mode == WB_SYNC_ALL || wbc->for_sync || 14928c2ecf20Sopenharmony_ci time_after(jiffies, inode->dirtied_time_when + 14938c2ecf20Sopenharmony_ci dirtytime_expire_interval * HZ))) { 14948c2ecf20Sopenharmony_ci trace_writeback_lazytime(inode); 14958c2ecf20Sopenharmony_ci mark_inode_dirty_sync(inode); 14968c2ecf20Sopenharmony_ci } 14978c2ecf20Sopenharmony_ci 14988c2ecf20Sopenharmony_ci /* 14998c2ecf20Sopenharmony_ci * Some filesystems may redirty the inode during the writeback 15008c2ecf20Sopenharmony_ci * due to delalloc, clear dirty metadata flags right before 15018c2ecf20Sopenharmony_ci * write_inode() 15028c2ecf20Sopenharmony_ci */ 15038c2ecf20Sopenharmony_ci spin_lock(&inode->i_lock); 15048c2ecf20Sopenharmony_ci dirty = inode->i_state & I_DIRTY; 15058c2ecf20Sopenharmony_ci inode->i_state &= ~dirty; 15068c2ecf20Sopenharmony_ci 15078c2ecf20Sopenharmony_ci /* 15088c2ecf20Sopenharmony_ci * Paired with smp_mb() in __mark_inode_dirty(). This allows 15098c2ecf20Sopenharmony_ci * __mark_inode_dirty() to test i_state without grabbing i_lock - 15108c2ecf20Sopenharmony_ci * either they see the I_DIRTY bits cleared or we see the dirtied 15118c2ecf20Sopenharmony_ci * inode. 15128c2ecf20Sopenharmony_ci * 15138c2ecf20Sopenharmony_ci * I_DIRTY_PAGES is always cleared together above even if @mapping 15148c2ecf20Sopenharmony_ci * still has dirty pages. The flag is reinstated after smp_mb() if 15158c2ecf20Sopenharmony_ci * necessary. This guarantees that either __mark_inode_dirty() 15168c2ecf20Sopenharmony_ci * sees clear I_DIRTY_PAGES or we see PAGECACHE_TAG_DIRTY. 
15178c2ecf20Sopenharmony_ci */ 15188c2ecf20Sopenharmony_ci smp_mb(); 15198c2ecf20Sopenharmony_ci 15208c2ecf20Sopenharmony_ci if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) 15218c2ecf20Sopenharmony_ci inode->i_state |= I_DIRTY_PAGES; 15228c2ecf20Sopenharmony_ci 15238c2ecf20Sopenharmony_ci spin_unlock(&inode->i_lock); 15248c2ecf20Sopenharmony_ci 15258c2ecf20Sopenharmony_ci /* Don't write the inode if only I_DIRTY_PAGES was set */ 15268c2ecf20Sopenharmony_ci if (dirty & ~I_DIRTY_PAGES) { 15278c2ecf20Sopenharmony_ci int err = write_inode(inode, wbc); 15288c2ecf20Sopenharmony_ci if (ret == 0) 15298c2ecf20Sopenharmony_ci ret = err; 15308c2ecf20Sopenharmony_ci } 15318c2ecf20Sopenharmony_ci trace_writeback_single_inode(inode, wbc, nr_to_write); 15328c2ecf20Sopenharmony_ci return ret; 15338c2ecf20Sopenharmony_ci} 15348c2ecf20Sopenharmony_ci 15358c2ecf20Sopenharmony_ci/* 15368c2ecf20Sopenharmony_ci * Write out an inode's dirty pages. Either the caller has an active reference 15378c2ecf20Sopenharmony_ci * on the inode or the inode has I_WILL_FREE set. 15388c2ecf20Sopenharmony_ci * 15398c2ecf20Sopenharmony_ci * This function is designed to be called for writing back one inode which 15408c2ecf20Sopenharmony_ci * we go e.g. from filesystem. Flusher thread uses __writeback_single_inode() 15418c2ecf20Sopenharmony_ci * and does more profound writeback list handling in writeback_sb_inodes(). 
15428c2ecf20Sopenharmony_ci */ 15438c2ecf20Sopenharmony_cistatic int writeback_single_inode(struct inode *inode, 15448c2ecf20Sopenharmony_ci struct writeback_control *wbc) 15458c2ecf20Sopenharmony_ci{ 15468c2ecf20Sopenharmony_ci struct bdi_writeback *wb; 15478c2ecf20Sopenharmony_ci int ret = 0; 15488c2ecf20Sopenharmony_ci 15498c2ecf20Sopenharmony_ci spin_lock(&inode->i_lock); 15508c2ecf20Sopenharmony_ci if (!atomic_read(&inode->i_count)) 15518c2ecf20Sopenharmony_ci WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); 15528c2ecf20Sopenharmony_ci else 15538c2ecf20Sopenharmony_ci WARN_ON(inode->i_state & I_WILL_FREE); 15548c2ecf20Sopenharmony_ci 15558c2ecf20Sopenharmony_ci if (inode->i_state & I_SYNC) { 15568c2ecf20Sopenharmony_ci if (wbc->sync_mode != WB_SYNC_ALL) 15578c2ecf20Sopenharmony_ci goto out; 15588c2ecf20Sopenharmony_ci /* 15598c2ecf20Sopenharmony_ci * It's a data-integrity sync. We must wait. Since callers hold 15608c2ecf20Sopenharmony_ci * inode reference or inode has I_WILL_FREE set, it cannot go 15618c2ecf20Sopenharmony_ci * away under us. 15628c2ecf20Sopenharmony_ci */ 15638c2ecf20Sopenharmony_ci __inode_wait_for_writeback(inode); 15648c2ecf20Sopenharmony_ci } 15658c2ecf20Sopenharmony_ci WARN_ON(inode->i_state & I_SYNC); 15668c2ecf20Sopenharmony_ci /* 15678c2ecf20Sopenharmony_ci * Skip inode if it is clean and we have no outstanding writeback in 15688c2ecf20Sopenharmony_ci * WB_SYNC_ALL mode. We don't want to mess with writeback lists in this 15698c2ecf20Sopenharmony_ci * function since flusher thread may be doing for example sync in 15708c2ecf20Sopenharmony_ci * parallel and if we move the inode, it could get skipped. So here we 15718c2ecf20Sopenharmony_ci * make sure inode is on some writeback list and leave it there unless 15728c2ecf20Sopenharmony_ci * we have completely cleaned the inode. 
15738c2ecf20Sopenharmony_ci */ 15748c2ecf20Sopenharmony_ci if (!(inode->i_state & I_DIRTY_ALL) && 15758c2ecf20Sopenharmony_ci (wbc->sync_mode != WB_SYNC_ALL || 15768c2ecf20Sopenharmony_ci !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))) 15778c2ecf20Sopenharmony_ci goto out; 15788c2ecf20Sopenharmony_ci inode->i_state |= I_SYNC; 15798c2ecf20Sopenharmony_ci wbc_attach_and_unlock_inode(wbc, inode); 15808c2ecf20Sopenharmony_ci 15818c2ecf20Sopenharmony_ci ret = __writeback_single_inode(inode, wbc); 15828c2ecf20Sopenharmony_ci 15838c2ecf20Sopenharmony_ci wbc_detach_inode(wbc); 15848c2ecf20Sopenharmony_ci 15858c2ecf20Sopenharmony_ci wb = inode_to_wb_and_lock_list(inode); 15868c2ecf20Sopenharmony_ci spin_lock(&inode->i_lock); 15878c2ecf20Sopenharmony_ci /* 15888c2ecf20Sopenharmony_ci * If inode is clean, remove it from writeback lists. Otherwise don't 15898c2ecf20Sopenharmony_ci * touch it. See comment above for explanation. 15908c2ecf20Sopenharmony_ci */ 15918c2ecf20Sopenharmony_ci if (!(inode->i_state & I_DIRTY_ALL)) 15928c2ecf20Sopenharmony_ci inode_io_list_del_locked(inode, wb); 15938c2ecf20Sopenharmony_ci spin_unlock(&wb->list_lock); 15948c2ecf20Sopenharmony_ci inode_sync_complete(inode); 15958c2ecf20Sopenharmony_ciout: 15968c2ecf20Sopenharmony_ci spin_unlock(&inode->i_lock); 15978c2ecf20Sopenharmony_ci return ret; 15988c2ecf20Sopenharmony_ci} 15998c2ecf20Sopenharmony_ci 16008c2ecf20Sopenharmony_cistatic long writeback_chunk_size(struct bdi_writeback *wb, 16018c2ecf20Sopenharmony_ci struct wb_writeback_work *work) 16028c2ecf20Sopenharmony_ci{ 16038c2ecf20Sopenharmony_ci long pages; 16048c2ecf20Sopenharmony_ci 16058c2ecf20Sopenharmony_ci /* 16068c2ecf20Sopenharmony_ci * WB_SYNC_ALL mode does livelock avoidance by syncing dirty 16078c2ecf20Sopenharmony_ci * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX 16088c2ecf20Sopenharmony_ci * here avoids calling into writeback_inodes_wb() more than once. 
16098c2ecf20Sopenharmony_ci * 16108c2ecf20Sopenharmony_ci * The intended call sequence for WB_SYNC_ALL writeback is: 16118c2ecf20Sopenharmony_ci * 16128c2ecf20Sopenharmony_ci * wb_writeback() 16138c2ecf20Sopenharmony_ci * writeback_sb_inodes() <== called only once 16148c2ecf20Sopenharmony_ci * write_cache_pages() <== called once for each inode 16158c2ecf20Sopenharmony_ci * (quickly) tag currently dirty pages 16168c2ecf20Sopenharmony_ci * (maybe slowly) sync all tagged pages 16178c2ecf20Sopenharmony_ci */ 16188c2ecf20Sopenharmony_ci if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages) 16198c2ecf20Sopenharmony_ci pages = LONG_MAX; 16208c2ecf20Sopenharmony_ci else { 16218c2ecf20Sopenharmony_ci pages = min(wb->avg_write_bandwidth / 2, 16228c2ecf20Sopenharmony_ci global_wb_domain.dirty_limit / DIRTY_SCOPE); 16238c2ecf20Sopenharmony_ci pages = min(pages, work->nr_pages); 16248c2ecf20Sopenharmony_ci pages = round_down(pages + MIN_WRITEBACK_PAGES, 16258c2ecf20Sopenharmony_ci MIN_WRITEBACK_PAGES); 16268c2ecf20Sopenharmony_ci } 16278c2ecf20Sopenharmony_ci 16288c2ecf20Sopenharmony_ci return pages; 16298c2ecf20Sopenharmony_ci} 16308c2ecf20Sopenharmony_ci 16318c2ecf20Sopenharmony_ci/* 16328c2ecf20Sopenharmony_ci * Write a portion of b_io inodes which belong to @sb. 16338c2ecf20Sopenharmony_ci * 16348c2ecf20Sopenharmony_ci * Return the number of pages and/or inodes written. 16358c2ecf20Sopenharmony_ci * 16368c2ecf20Sopenharmony_ci * NOTE! This is called with wb->list_lock held, and will 16378c2ecf20Sopenharmony_ci * unlock and relock that for each inode it ends up doing 16388c2ecf20Sopenharmony_ci * IO for. 
16398c2ecf20Sopenharmony_ci */ 16408c2ecf20Sopenharmony_cistatic long writeback_sb_inodes(struct super_block *sb, 16418c2ecf20Sopenharmony_ci struct bdi_writeback *wb, 16428c2ecf20Sopenharmony_ci struct wb_writeback_work *work) 16438c2ecf20Sopenharmony_ci{ 16448c2ecf20Sopenharmony_ci struct writeback_control wbc = { 16458c2ecf20Sopenharmony_ci .sync_mode = work->sync_mode, 16468c2ecf20Sopenharmony_ci .tagged_writepages = work->tagged_writepages, 16478c2ecf20Sopenharmony_ci .for_kupdate = work->for_kupdate, 16488c2ecf20Sopenharmony_ci .for_background = work->for_background, 16498c2ecf20Sopenharmony_ci .for_sync = work->for_sync, 16508c2ecf20Sopenharmony_ci .range_cyclic = work->range_cyclic, 16518c2ecf20Sopenharmony_ci .range_start = 0, 16528c2ecf20Sopenharmony_ci .range_end = LLONG_MAX, 16538c2ecf20Sopenharmony_ci }; 16548c2ecf20Sopenharmony_ci unsigned long start_time = jiffies; 16558c2ecf20Sopenharmony_ci long write_chunk; 16568c2ecf20Sopenharmony_ci long total_wrote = 0; /* count both pages and inodes */ 16578c2ecf20Sopenharmony_ci 16588c2ecf20Sopenharmony_ci while (!list_empty(&wb->b_io)) { 16598c2ecf20Sopenharmony_ci struct inode *inode = wb_inode(wb->b_io.prev); 16608c2ecf20Sopenharmony_ci struct bdi_writeback *tmp_wb; 16618c2ecf20Sopenharmony_ci long wrote; 16628c2ecf20Sopenharmony_ci 16638c2ecf20Sopenharmony_ci if (inode->i_sb != sb) { 16648c2ecf20Sopenharmony_ci if (work->sb) { 16658c2ecf20Sopenharmony_ci /* 16668c2ecf20Sopenharmony_ci * We only want to write back data for this 16678c2ecf20Sopenharmony_ci * superblock, move all inodes not belonging 16688c2ecf20Sopenharmony_ci * to it back onto the dirty list. 16698c2ecf20Sopenharmony_ci */ 16708c2ecf20Sopenharmony_ci redirty_tail(inode, wb); 16718c2ecf20Sopenharmony_ci continue; 16728c2ecf20Sopenharmony_ci } 16738c2ecf20Sopenharmony_ci 16748c2ecf20Sopenharmony_ci /* 16758c2ecf20Sopenharmony_ci * The inode belongs to a different superblock. 
16768c2ecf20Sopenharmony_ci * Bounce back to the caller to unpin this and 16778c2ecf20Sopenharmony_ci * pin the next superblock. 16788c2ecf20Sopenharmony_ci */ 16798c2ecf20Sopenharmony_ci break; 16808c2ecf20Sopenharmony_ci } 16818c2ecf20Sopenharmony_ci 16828c2ecf20Sopenharmony_ci /* 16838c2ecf20Sopenharmony_ci * Don't bother with new inodes or inodes being freed, first 16848c2ecf20Sopenharmony_ci * kind does not need periodic writeout yet, and for the latter 16858c2ecf20Sopenharmony_ci * kind writeout is handled by the freer. 16868c2ecf20Sopenharmony_ci */ 16878c2ecf20Sopenharmony_ci spin_lock(&inode->i_lock); 16888c2ecf20Sopenharmony_ci if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { 16898c2ecf20Sopenharmony_ci redirty_tail_locked(inode, wb); 16908c2ecf20Sopenharmony_ci spin_unlock(&inode->i_lock); 16918c2ecf20Sopenharmony_ci continue; 16928c2ecf20Sopenharmony_ci } 16938c2ecf20Sopenharmony_ci if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) { 16948c2ecf20Sopenharmony_ci /* 16958c2ecf20Sopenharmony_ci * If this inode is locked for writeback and we are not 16968c2ecf20Sopenharmony_ci * doing writeback-for-data-integrity, move it to 16978c2ecf20Sopenharmony_ci * b_more_io so that writeback can proceed with the 16988c2ecf20Sopenharmony_ci * other inodes on s_io. 16998c2ecf20Sopenharmony_ci * 17008c2ecf20Sopenharmony_ci * We'll have another go at writing back this inode 17018c2ecf20Sopenharmony_ci * when we completed a full scan of b_io. 
17028c2ecf20Sopenharmony_ci */ 17038c2ecf20Sopenharmony_ci spin_unlock(&inode->i_lock); 17048c2ecf20Sopenharmony_ci requeue_io(inode, wb); 17058c2ecf20Sopenharmony_ci trace_writeback_sb_inodes_requeue(inode); 17068c2ecf20Sopenharmony_ci continue; 17078c2ecf20Sopenharmony_ci } 17088c2ecf20Sopenharmony_ci spin_unlock(&wb->list_lock); 17098c2ecf20Sopenharmony_ci 17108c2ecf20Sopenharmony_ci /* 17118c2ecf20Sopenharmony_ci * We already requeued the inode if it had I_SYNC set and we 17128c2ecf20Sopenharmony_ci * are doing WB_SYNC_NONE writeback. So this catches only the 17138c2ecf20Sopenharmony_ci * WB_SYNC_ALL case. 17148c2ecf20Sopenharmony_ci */ 17158c2ecf20Sopenharmony_ci if (inode->i_state & I_SYNC) { 17168c2ecf20Sopenharmony_ci /* Wait for I_SYNC. This function drops i_lock... */ 17178c2ecf20Sopenharmony_ci inode_sleep_on_writeback(inode); 17188c2ecf20Sopenharmony_ci /* Inode may be gone, start again */ 17198c2ecf20Sopenharmony_ci spin_lock(&wb->list_lock); 17208c2ecf20Sopenharmony_ci continue; 17218c2ecf20Sopenharmony_ci } 17228c2ecf20Sopenharmony_ci inode->i_state |= I_SYNC; 17238c2ecf20Sopenharmony_ci wbc_attach_and_unlock_inode(&wbc, inode); 17248c2ecf20Sopenharmony_ci 17258c2ecf20Sopenharmony_ci write_chunk = writeback_chunk_size(wb, work); 17268c2ecf20Sopenharmony_ci wbc.nr_to_write = write_chunk; 17278c2ecf20Sopenharmony_ci wbc.pages_skipped = 0; 17288c2ecf20Sopenharmony_ci 17298c2ecf20Sopenharmony_ci /* 17308c2ecf20Sopenharmony_ci * We use I_SYNC to pin the inode in memory. While it is set 17318c2ecf20Sopenharmony_ci * evict_inode() will wait so the inode cannot be freed. 17328c2ecf20Sopenharmony_ci */ 17338c2ecf20Sopenharmony_ci __writeback_single_inode(inode, &wbc); 17348c2ecf20Sopenharmony_ci 17358c2ecf20Sopenharmony_ci wbc_detach_inode(&wbc); 17368c2ecf20Sopenharmony_ci work->nr_pages -= write_chunk - wbc.nr_to_write; 17378c2ecf20Sopenharmony_ci wrote = write_chunk - wbc.nr_to_write - wbc.pages_skipped; 17388c2ecf20Sopenharmony_ci wrote = wrote < 0 ? 
0 : wrote; 17398c2ecf20Sopenharmony_ci total_wrote += wrote; 17408c2ecf20Sopenharmony_ci 17418c2ecf20Sopenharmony_ci if (need_resched()) { 17428c2ecf20Sopenharmony_ci /* 17438c2ecf20Sopenharmony_ci * We're trying to balance between building up a nice 17448c2ecf20Sopenharmony_ci * long list of IOs to improve our merge rate, and 17458c2ecf20Sopenharmony_ci * getting those IOs out quickly for anyone throttling 17468c2ecf20Sopenharmony_ci * in balance_dirty_pages(). cond_resched() doesn't 17478c2ecf20Sopenharmony_ci * unplug, so get our IOs out the door before we 17488c2ecf20Sopenharmony_ci * give up the CPU. 17498c2ecf20Sopenharmony_ci */ 17508c2ecf20Sopenharmony_ci blk_flush_plug(current); 17518c2ecf20Sopenharmony_ci cond_resched(); 17528c2ecf20Sopenharmony_ci } 17538c2ecf20Sopenharmony_ci 17548c2ecf20Sopenharmony_ci /* 17558c2ecf20Sopenharmony_ci * Requeue @inode if still dirty. Be careful as @inode may 17568c2ecf20Sopenharmony_ci * have been switched to another wb in the meantime. 17578c2ecf20Sopenharmony_ci */ 17588c2ecf20Sopenharmony_ci tmp_wb = inode_to_wb_and_lock_list(inode); 17598c2ecf20Sopenharmony_ci spin_lock(&inode->i_lock); 17608c2ecf20Sopenharmony_ci if (!(inode->i_state & I_DIRTY_ALL)) 17618c2ecf20Sopenharmony_ci total_wrote++; 17628c2ecf20Sopenharmony_ci requeue_inode(inode, tmp_wb, &wbc); 17638c2ecf20Sopenharmony_ci inode_sync_complete(inode); 17648c2ecf20Sopenharmony_ci spin_unlock(&inode->i_lock); 17658c2ecf20Sopenharmony_ci 17668c2ecf20Sopenharmony_ci if (unlikely(tmp_wb != wb)) { 17678c2ecf20Sopenharmony_ci spin_unlock(&tmp_wb->list_lock); 17688c2ecf20Sopenharmony_ci spin_lock(&wb->list_lock); 17698c2ecf20Sopenharmony_ci } 17708c2ecf20Sopenharmony_ci 17718c2ecf20Sopenharmony_ci /* 17728c2ecf20Sopenharmony_ci * bail out to wb_writeback() often enough to check 17738c2ecf20Sopenharmony_ci * background threshold and other termination conditions. 
17748c2ecf20Sopenharmony_ci */ 17758c2ecf20Sopenharmony_ci if (total_wrote) { 17768c2ecf20Sopenharmony_ci if (time_is_before_jiffies(start_time + HZ / 10UL)) 17778c2ecf20Sopenharmony_ci break; 17788c2ecf20Sopenharmony_ci if (work->nr_pages <= 0) 17798c2ecf20Sopenharmony_ci break; 17808c2ecf20Sopenharmony_ci } 17818c2ecf20Sopenharmony_ci } 17828c2ecf20Sopenharmony_ci return total_wrote; 17838c2ecf20Sopenharmony_ci} 17848c2ecf20Sopenharmony_ci 17858c2ecf20Sopenharmony_cistatic long __writeback_inodes_wb(struct bdi_writeback *wb, 17868c2ecf20Sopenharmony_ci struct wb_writeback_work *work) 17878c2ecf20Sopenharmony_ci{ 17888c2ecf20Sopenharmony_ci unsigned long start_time = jiffies; 17898c2ecf20Sopenharmony_ci long wrote = 0; 17908c2ecf20Sopenharmony_ci 17918c2ecf20Sopenharmony_ci while (!list_empty(&wb->b_io)) { 17928c2ecf20Sopenharmony_ci struct inode *inode = wb_inode(wb->b_io.prev); 17938c2ecf20Sopenharmony_ci struct super_block *sb = inode->i_sb; 17948c2ecf20Sopenharmony_ci 17958c2ecf20Sopenharmony_ci if (!trylock_super(sb)) { 17968c2ecf20Sopenharmony_ci /* 17978c2ecf20Sopenharmony_ci * trylock_super() may fail consistently due to 17988c2ecf20Sopenharmony_ci * s_umount being grabbed by someone else. Don't use 17998c2ecf20Sopenharmony_ci * requeue_io() to avoid busy retrying the inode/sb. 
18008c2ecf20Sopenharmony_ci */ 18018c2ecf20Sopenharmony_ci redirty_tail(inode, wb); 18028c2ecf20Sopenharmony_ci continue; 18038c2ecf20Sopenharmony_ci } 18048c2ecf20Sopenharmony_ci wrote += writeback_sb_inodes(sb, wb, work); 18058c2ecf20Sopenharmony_ci up_read(&sb->s_umount); 18068c2ecf20Sopenharmony_ci 18078c2ecf20Sopenharmony_ci /* refer to the same tests at the end of writeback_sb_inodes */ 18088c2ecf20Sopenharmony_ci if (wrote) { 18098c2ecf20Sopenharmony_ci if (time_is_before_jiffies(start_time + HZ / 10UL)) 18108c2ecf20Sopenharmony_ci break; 18118c2ecf20Sopenharmony_ci if (work->nr_pages <= 0) 18128c2ecf20Sopenharmony_ci break; 18138c2ecf20Sopenharmony_ci } 18148c2ecf20Sopenharmony_ci } 18158c2ecf20Sopenharmony_ci /* Leave any unwritten inodes on b_io */ 18168c2ecf20Sopenharmony_ci return wrote; 18178c2ecf20Sopenharmony_ci} 18188c2ecf20Sopenharmony_ci 18198c2ecf20Sopenharmony_cistatic long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, 18208c2ecf20Sopenharmony_ci enum wb_reason reason) 18218c2ecf20Sopenharmony_ci{ 18228c2ecf20Sopenharmony_ci struct wb_writeback_work work = { 18238c2ecf20Sopenharmony_ci .nr_pages = nr_pages, 18248c2ecf20Sopenharmony_ci .sync_mode = WB_SYNC_NONE, 18258c2ecf20Sopenharmony_ci .range_cyclic = 1, 18268c2ecf20Sopenharmony_ci .reason = reason, 18278c2ecf20Sopenharmony_ci }; 18288c2ecf20Sopenharmony_ci struct blk_plug plug; 18298c2ecf20Sopenharmony_ci 18308c2ecf20Sopenharmony_ci blk_start_plug(&plug); 18318c2ecf20Sopenharmony_ci spin_lock(&wb->list_lock); 18328c2ecf20Sopenharmony_ci if (list_empty(&wb->b_io)) 18338c2ecf20Sopenharmony_ci queue_io(wb, &work, jiffies); 18348c2ecf20Sopenharmony_ci __writeback_inodes_wb(wb, &work); 18358c2ecf20Sopenharmony_ci spin_unlock(&wb->list_lock); 18368c2ecf20Sopenharmony_ci blk_finish_plug(&plug); 18378c2ecf20Sopenharmony_ci 18388c2ecf20Sopenharmony_ci return nr_pages - work.nr_pages; 18398c2ecf20Sopenharmony_ci} 18408c2ecf20Sopenharmony_ci 18418c2ecf20Sopenharmony_ci/* 
18428c2ecf20Sopenharmony_ci * Explicit flushing or periodic writeback of "old" data. 18438c2ecf20Sopenharmony_ci * 18448c2ecf20Sopenharmony_ci * Define "old": the first time one of an inode's pages is dirtied, we mark the 18458c2ecf20Sopenharmony_ci * dirtying-time in the inode's address_space. So this periodic writeback code 18468c2ecf20Sopenharmony_ci * just walks the superblock inode list, writing back any inodes which are 18478c2ecf20Sopenharmony_ci * older than a specific point in time. 18488c2ecf20Sopenharmony_ci * 18498c2ecf20Sopenharmony_ci * Try to run once per dirty_writeback_interval. But if a writeback event 18508c2ecf20Sopenharmony_ci * takes longer than a dirty_writeback_interval interval, then leave a 18518c2ecf20Sopenharmony_ci * one-second gap. 18528c2ecf20Sopenharmony_ci * 18538c2ecf20Sopenharmony_ci * dirtied_before takes precedence over nr_to_write. So we'll only write back 18548c2ecf20Sopenharmony_ci * all dirty pages if they are all attached to "old" mappings. 18558c2ecf20Sopenharmony_ci */ 18568c2ecf20Sopenharmony_cistatic long wb_writeback(struct bdi_writeback *wb, 18578c2ecf20Sopenharmony_ci struct wb_writeback_work *work) 18588c2ecf20Sopenharmony_ci{ 18598c2ecf20Sopenharmony_ci unsigned long wb_start = jiffies; 18608c2ecf20Sopenharmony_ci long nr_pages = work->nr_pages; 18618c2ecf20Sopenharmony_ci unsigned long dirtied_before = jiffies; 18628c2ecf20Sopenharmony_ci struct inode *inode; 18638c2ecf20Sopenharmony_ci long progress; 18648c2ecf20Sopenharmony_ci struct blk_plug plug; 18658c2ecf20Sopenharmony_ci 18668c2ecf20Sopenharmony_ci blk_start_plug(&plug); 18678c2ecf20Sopenharmony_ci spin_lock(&wb->list_lock); 18688c2ecf20Sopenharmony_ci for (;;) { 18698c2ecf20Sopenharmony_ci /* 18708c2ecf20Sopenharmony_ci * Stop writeback when nr_pages has been consumed 18718c2ecf20Sopenharmony_ci */ 18728c2ecf20Sopenharmony_ci if (work->nr_pages <= 0) 18738c2ecf20Sopenharmony_ci break; 18748c2ecf20Sopenharmony_ci 18758c2ecf20Sopenharmony_ci /* 
18768c2ecf20Sopenharmony_ci * Background writeout and kupdate-style writeback may 18778c2ecf20Sopenharmony_ci * run forever. Stop them if there is other work to do 18788c2ecf20Sopenharmony_ci * so that e.g. sync can proceed. They'll be restarted 18798c2ecf20Sopenharmony_ci * after the other works are all done. 18808c2ecf20Sopenharmony_ci */ 18818c2ecf20Sopenharmony_ci if ((work->for_background || work->for_kupdate) && 18828c2ecf20Sopenharmony_ci !list_empty(&wb->work_list)) 18838c2ecf20Sopenharmony_ci break; 18848c2ecf20Sopenharmony_ci 18858c2ecf20Sopenharmony_ci /* 18868c2ecf20Sopenharmony_ci * For background writeout, stop when we are below the 18878c2ecf20Sopenharmony_ci * background dirty threshold 18888c2ecf20Sopenharmony_ci */ 18898c2ecf20Sopenharmony_ci if (work->for_background && !wb_over_bg_thresh(wb)) 18908c2ecf20Sopenharmony_ci break; 18918c2ecf20Sopenharmony_ci 18928c2ecf20Sopenharmony_ci /* 18938c2ecf20Sopenharmony_ci * Kupdate and background works are special and we want to 18948c2ecf20Sopenharmony_ci * include all inodes that need writing. Livelock avoidance is 18958c2ecf20Sopenharmony_ci * handled by these works yielding to any other work so we are 18968c2ecf20Sopenharmony_ci * safe. 
18978c2ecf20Sopenharmony_ci */ 18988c2ecf20Sopenharmony_ci if (work->for_kupdate) { 18998c2ecf20Sopenharmony_ci dirtied_before = jiffies - 19008c2ecf20Sopenharmony_ci msecs_to_jiffies(dirty_expire_interval * 10); 19018c2ecf20Sopenharmony_ci } else if (work->for_background) 19028c2ecf20Sopenharmony_ci dirtied_before = jiffies; 19038c2ecf20Sopenharmony_ci 19048c2ecf20Sopenharmony_ci trace_writeback_start(wb, work); 19058c2ecf20Sopenharmony_ci if (list_empty(&wb->b_io)) 19068c2ecf20Sopenharmony_ci queue_io(wb, work, dirtied_before); 19078c2ecf20Sopenharmony_ci if (work->sb) 19088c2ecf20Sopenharmony_ci progress = writeback_sb_inodes(work->sb, wb, work); 19098c2ecf20Sopenharmony_ci else 19108c2ecf20Sopenharmony_ci progress = __writeback_inodes_wb(wb, work); 19118c2ecf20Sopenharmony_ci trace_writeback_written(wb, work); 19128c2ecf20Sopenharmony_ci 19138c2ecf20Sopenharmony_ci wb_update_bandwidth(wb, wb_start); 19148c2ecf20Sopenharmony_ci 19158c2ecf20Sopenharmony_ci /* 19168c2ecf20Sopenharmony_ci * Did we write something? Try for more 19178c2ecf20Sopenharmony_ci * 19188c2ecf20Sopenharmony_ci * Dirty inodes are moved to b_io for writeback in batches. 19198c2ecf20Sopenharmony_ci * The completion of the current batch does not necessarily 19208c2ecf20Sopenharmony_ci * mean the overall work is done. So we keep looping as long 19218c2ecf20Sopenharmony_ci * as made some progress on cleaning pages or inodes. 19228c2ecf20Sopenharmony_ci */ 19238c2ecf20Sopenharmony_ci if (progress) 19248c2ecf20Sopenharmony_ci continue; 19258c2ecf20Sopenharmony_ci /* 19268c2ecf20Sopenharmony_ci * No more inodes for IO, bail 19278c2ecf20Sopenharmony_ci */ 19288c2ecf20Sopenharmony_ci if (list_empty(&wb->b_more_io)) 19298c2ecf20Sopenharmony_ci break; 19308c2ecf20Sopenharmony_ci /* 19318c2ecf20Sopenharmony_ci * Nothing written. Wait for some inode to 19328c2ecf20Sopenharmony_ci * become available for writeback. Otherwise 19338c2ecf20Sopenharmony_ci * we'll just busyloop. 
19348c2ecf20Sopenharmony_ci */ 19358c2ecf20Sopenharmony_ci trace_writeback_wait(wb, work); 19368c2ecf20Sopenharmony_ci inode = wb_inode(wb->b_more_io.prev); 19378c2ecf20Sopenharmony_ci spin_lock(&inode->i_lock); 19388c2ecf20Sopenharmony_ci spin_unlock(&wb->list_lock); 19398c2ecf20Sopenharmony_ci /* This function drops i_lock... */ 19408c2ecf20Sopenharmony_ci inode_sleep_on_writeback(inode); 19418c2ecf20Sopenharmony_ci spin_lock(&wb->list_lock); 19428c2ecf20Sopenharmony_ci } 19438c2ecf20Sopenharmony_ci spin_unlock(&wb->list_lock); 19448c2ecf20Sopenharmony_ci blk_finish_plug(&plug); 19458c2ecf20Sopenharmony_ci 19468c2ecf20Sopenharmony_ci return nr_pages - work->nr_pages; 19478c2ecf20Sopenharmony_ci} 19488c2ecf20Sopenharmony_ci 19498c2ecf20Sopenharmony_ci/* 19508c2ecf20Sopenharmony_ci * Return the next wb_writeback_work struct that hasn't been processed yet. 19518c2ecf20Sopenharmony_ci */ 19528c2ecf20Sopenharmony_cistatic struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb) 19538c2ecf20Sopenharmony_ci{ 19548c2ecf20Sopenharmony_ci struct wb_writeback_work *work = NULL; 19558c2ecf20Sopenharmony_ci 19568c2ecf20Sopenharmony_ci spin_lock_bh(&wb->work_lock); 19578c2ecf20Sopenharmony_ci if (!list_empty(&wb->work_list)) { 19588c2ecf20Sopenharmony_ci work = list_entry(wb->work_list.next, 19598c2ecf20Sopenharmony_ci struct wb_writeback_work, list); 19608c2ecf20Sopenharmony_ci list_del_init(&work->list); 19618c2ecf20Sopenharmony_ci } 19628c2ecf20Sopenharmony_ci spin_unlock_bh(&wb->work_lock); 19638c2ecf20Sopenharmony_ci return work; 19648c2ecf20Sopenharmony_ci} 19658c2ecf20Sopenharmony_ci 19668c2ecf20Sopenharmony_cistatic long wb_check_background_flush(struct bdi_writeback *wb) 19678c2ecf20Sopenharmony_ci{ 19688c2ecf20Sopenharmony_ci if (wb_over_bg_thresh(wb)) { 19698c2ecf20Sopenharmony_ci 19708c2ecf20Sopenharmony_ci struct wb_writeback_work work = { 19718c2ecf20Sopenharmony_ci .nr_pages = LONG_MAX, 19728c2ecf20Sopenharmony_ci .sync_mode = WB_SYNC_NONE, 
19738c2ecf20Sopenharmony_ci .for_background = 1, 19748c2ecf20Sopenharmony_ci .range_cyclic = 1, 19758c2ecf20Sopenharmony_ci .reason = WB_REASON_BACKGROUND, 19768c2ecf20Sopenharmony_ci }; 19778c2ecf20Sopenharmony_ci 19788c2ecf20Sopenharmony_ci return wb_writeback(wb, &work); 19798c2ecf20Sopenharmony_ci } 19808c2ecf20Sopenharmony_ci 19818c2ecf20Sopenharmony_ci return 0; 19828c2ecf20Sopenharmony_ci} 19838c2ecf20Sopenharmony_ci 19848c2ecf20Sopenharmony_cistatic long wb_check_old_data_flush(struct bdi_writeback *wb) 19858c2ecf20Sopenharmony_ci{ 19868c2ecf20Sopenharmony_ci unsigned long expired; 19878c2ecf20Sopenharmony_ci long nr_pages; 19888c2ecf20Sopenharmony_ci 19898c2ecf20Sopenharmony_ci /* 19908c2ecf20Sopenharmony_ci * When set to zero, disable periodic writeback 19918c2ecf20Sopenharmony_ci */ 19928c2ecf20Sopenharmony_ci if (!dirty_writeback_interval) 19938c2ecf20Sopenharmony_ci return 0; 19948c2ecf20Sopenharmony_ci 19958c2ecf20Sopenharmony_ci expired = wb->last_old_flush + 19968c2ecf20Sopenharmony_ci msecs_to_jiffies(dirty_writeback_interval * 10); 19978c2ecf20Sopenharmony_ci if (time_before(jiffies, expired)) 19988c2ecf20Sopenharmony_ci return 0; 19998c2ecf20Sopenharmony_ci 20008c2ecf20Sopenharmony_ci wb->last_old_flush = jiffies; 20018c2ecf20Sopenharmony_ci nr_pages = get_nr_dirty_pages(); 20028c2ecf20Sopenharmony_ci 20038c2ecf20Sopenharmony_ci if (nr_pages) { 20048c2ecf20Sopenharmony_ci struct wb_writeback_work work = { 20058c2ecf20Sopenharmony_ci .nr_pages = nr_pages, 20068c2ecf20Sopenharmony_ci .sync_mode = WB_SYNC_NONE, 20078c2ecf20Sopenharmony_ci .for_kupdate = 1, 20088c2ecf20Sopenharmony_ci .range_cyclic = 1, 20098c2ecf20Sopenharmony_ci .reason = WB_REASON_PERIODIC, 20108c2ecf20Sopenharmony_ci }; 20118c2ecf20Sopenharmony_ci 20128c2ecf20Sopenharmony_ci return wb_writeback(wb, &work); 20138c2ecf20Sopenharmony_ci } 20148c2ecf20Sopenharmony_ci 20158c2ecf20Sopenharmony_ci return 0; 20168c2ecf20Sopenharmony_ci} 20178c2ecf20Sopenharmony_ci 
20188c2ecf20Sopenharmony_cistatic long wb_check_start_all(struct bdi_writeback *wb) 20198c2ecf20Sopenharmony_ci{ 20208c2ecf20Sopenharmony_ci long nr_pages; 20218c2ecf20Sopenharmony_ci 20228c2ecf20Sopenharmony_ci if (!test_bit(WB_start_all, &wb->state)) 20238c2ecf20Sopenharmony_ci return 0; 20248c2ecf20Sopenharmony_ci 20258c2ecf20Sopenharmony_ci nr_pages = get_nr_dirty_pages(); 20268c2ecf20Sopenharmony_ci if (nr_pages) { 20278c2ecf20Sopenharmony_ci struct wb_writeback_work work = { 20288c2ecf20Sopenharmony_ci .nr_pages = wb_split_bdi_pages(wb, nr_pages), 20298c2ecf20Sopenharmony_ci .sync_mode = WB_SYNC_NONE, 20308c2ecf20Sopenharmony_ci .range_cyclic = 1, 20318c2ecf20Sopenharmony_ci .reason = wb->start_all_reason, 20328c2ecf20Sopenharmony_ci }; 20338c2ecf20Sopenharmony_ci 20348c2ecf20Sopenharmony_ci nr_pages = wb_writeback(wb, &work); 20358c2ecf20Sopenharmony_ci } 20368c2ecf20Sopenharmony_ci 20378c2ecf20Sopenharmony_ci clear_bit(WB_start_all, &wb->state); 20388c2ecf20Sopenharmony_ci return nr_pages; 20398c2ecf20Sopenharmony_ci} 20408c2ecf20Sopenharmony_ci 20418c2ecf20Sopenharmony_ci 20428c2ecf20Sopenharmony_ci/* 20438c2ecf20Sopenharmony_ci * Retrieve work items and do the writeback they describe 20448c2ecf20Sopenharmony_ci */ 20458c2ecf20Sopenharmony_cistatic long wb_do_writeback(struct bdi_writeback *wb) 20468c2ecf20Sopenharmony_ci{ 20478c2ecf20Sopenharmony_ci struct wb_writeback_work *work; 20488c2ecf20Sopenharmony_ci long wrote = 0; 20498c2ecf20Sopenharmony_ci 20508c2ecf20Sopenharmony_ci set_bit(WB_writeback_running, &wb->state); 20518c2ecf20Sopenharmony_ci while ((work = get_next_work_item(wb)) != NULL) { 20528c2ecf20Sopenharmony_ci trace_writeback_exec(wb, work); 20538c2ecf20Sopenharmony_ci wrote += wb_writeback(wb, work); 20548c2ecf20Sopenharmony_ci finish_writeback_work(wb, work); 20558c2ecf20Sopenharmony_ci } 20568c2ecf20Sopenharmony_ci 20578c2ecf20Sopenharmony_ci /* 20588c2ecf20Sopenharmony_ci * Check for a flush-everything request 
20598c2ecf20Sopenharmony_ci */ 20608c2ecf20Sopenharmony_ci wrote += wb_check_start_all(wb); 20618c2ecf20Sopenharmony_ci 20628c2ecf20Sopenharmony_ci /* 20638c2ecf20Sopenharmony_ci * Check for periodic writeback, kupdated() style 20648c2ecf20Sopenharmony_ci */ 20658c2ecf20Sopenharmony_ci wrote += wb_check_old_data_flush(wb); 20668c2ecf20Sopenharmony_ci wrote += wb_check_background_flush(wb); 20678c2ecf20Sopenharmony_ci clear_bit(WB_writeback_running, &wb->state); 20688c2ecf20Sopenharmony_ci 20698c2ecf20Sopenharmony_ci return wrote; 20708c2ecf20Sopenharmony_ci} 20718c2ecf20Sopenharmony_ci 20728c2ecf20Sopenharmony_ci/* 20738c2ecf20Sopenharmony_ci * Handle writeback of dirty data for the device backed by this bdi. Also 20748c2ecf20Sopenharmony_ci * reschedules periodically and does kupdated style flushing. 20758c2ecf20Sopenharmony_ci */ 20768c2ecf20Sopenharmony_civoid wb_workfn(struct work_struct *work) 20778c2ecf20Sopenharmony_ci{ 20788c2ecf20Sopenharmony_ci struct bdi_writeback *wb = container_of(to_delayed_work(work), 20798c2ecf20Sopenharmony_ci struct bdi_writeback, dwork); 20808c2ecf20Sopenharmony_ci long pages_written; 20818c2ecf20Sopenharmony_ci 20828c2ecf20Sopenharmony_ci set_worker_desc("flush-%s", bdi_dev_name(wb->bdi)); 20838c2ecf20Sopenharmony_ci current->flags |= PF_SWAPWRITE; 20848c2ecf20Sopenharmony_ci 20858c2ecf20Sopenharmony_ci if (likely(!current_is_workqueue_rescuer() || 20868c2ecf20Sopenharmony_ci !test_bit(WB_registered, &wb->state))) { 20878c2ecf20Sopenharmony_ci /* 20888c2ecf20Sopenharmony_ci * The normal path. Keep writing back @wb until its 20898c2ecf20Sopenharmony_ci * work_list is empty. Note that this path is also taken 20908c2ecf20Sopenharmony_ci * if @wb is shutting down even when we're running off the 20918c2ecf20Sopenharmony_ci * rescuer as work_list needs to be drained. 
20928c2ecf20Sopenharmony_ci */ 20938c2ecf20Sopenharmony_ci do { 20948c2ecf20Sopenharmony_ci pages_written = wb_do_writeback(wb); 20958c2ecf20Sopenharmony_ci trace_writeback_pages_written(pages_written); 20968c2ecf20Sopenharmony_ci } while (!list_empty(&wb->work_list)); 20978c2ecf20Sopenharmony_ci } else { 20988c2ecf20Sopenharmony_ci /* 20998c2ecf20Sopenharmony_ci * bdi_wq can't get enough workers and we're running off 21008c2ecf20Sopenharmony_ci * the emergency worker. Don't hog it. Hopefully, 1024 is 21018c2ecf20Sopenharmony_ci * enough for efficient IO. 21028c2ecf20Sopenharmony_ci */ 21038c2ecf20Sopenharmony_ci pages_written = writeback_inodes_wb(wb, 1024, 21048c2ecf20Sopenharmony_ci WB_REASON_FORKER_THREAD); 21058c2ecf20Sopenharmony_ci trace_writeback_pages_written(pages_written); 21068c2ecf20Sopenharmony_ci } 21078c2ecf20Sopenharmony_ci 21088c2ecf20Sopenharmony_ci if (!list_empty(&wb->work_list)) 21098c2ecf20Sopenharmony_ci wb_wakeup(wb); 21108c2ecf20Sopenharmony_ci else if (wb_has_dirty_io(wb) && dirty_writeback_interval) 21118c2ecf20Sopenharmony_ci wb_wakeup_delayed(wb); 21128c2ecf20Sopenharmony_ci 21138c2ecf20Sopenharmony_ci current->flags &= ~PF_SWAPWRITE; 21148c2ecf20Sopenharmony_ci} 21158c2ecf20Sopenharmony_ci 21168c2ecf20Sopenharmony_ci/* 21178c2ecf20Sopenharmony_ci * Start writeback of `nr_pages' pages on this bdi. If `nr_pages' is zero, 21188c2ecf20Sopenharmony_ci * write back the whole world. 
21198c2ecf20Sopenharmony_ci */ 21208c2ecf20Sopenharmony_cistatic void __wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, 21218c2ecf20Sopenharmony_ci enum wb_reason reason) 21228c2ecf20Sopenharmony_ci{ 21238c2ecf20Sopenharmony_ci struct bdi_writeback *wb; 21248c2ecf20Sopenharmony_ci 21258c2ecf20Sopenharmony_ci if (!bdi_has_dirty_io(bdi)) 21268c2ecf20Sopenharmony_ci return; 21278c2ecf20Sopenharmony_ci 21288c2ecf20Sopenharmony_ci list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) 21298c2ecf20Sopenharmony_ci wb_start_writeback(wb, reason); 21308c2ecf20Sopenharmony_ci} 21318c2ecf20Sopenharmony_ci 21328c2ecf20Sopenharmony_civoid wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, 21338c2ecf20Sopenharmony_ci enum wb_reason reason) 21348c2ecf20Sopenharmony_ci{ 21358c2ecf20Sopenharmony_ci rcu_read_lock(); 21368c2ecf20Sopenharmony_ci __wakeup_flusher_threads_bdi(bdi, reason); 21378c2ecf20Sopenharmony_ci rcu_read_unlock(); 21388c2ecf20Sopenharmony_ci} 21398c2ecf20Sopenharmony_ci 21408c2ecf20Sopenharmony_ci/* 21418c2ecf20Sopenharmony_ci * Wakeup the flusher threads to start writeback of all currently dirty pages 21428c2ecf20Sopenharmony_ci */ 21438c2ecf20Sopenharmony_civoid wakeup_flusher_threads(enum wb_reason reason) 21448c2ecf20Sopenharmony_ci{ 21458c2ecf20Sopenharmony_ci struct backing_dev_info *bdi; 21468c2ecf20Sopenharmony_ci 21478c2ecf20Sopenharmony_ci /* 21488c2ecf20Sopenharmony_ci * If we are expecting writeback progress we must submit plugged IO. 
21498c2ecf20Sopenharmony_ci */ 21508c2ecf20Sopenharmony_ci if (blk_needs_flush_plug(current)) 21518c2ecf20Sopenharmony_ci blk_schedule_flush_plug(current); 21528c2ecf20Sopenharmony_ci 21538c2ecf20Sopenharmony_ci rcu_read_lock(); 21548c2ecf20Sopenharmony_ci list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) 21558c2ecf20Sopenharmony_ci __wakeup_flusher_threads_bdi(bdi, reason); 21568c2ecf20Sopenharmony_ci rcu_read_unlock(); 21578c2ecf20Sopenharmony_ci} 21588c2ecf20Sopenharmony_ci 21598c2ecf20Sopenharmony_ci/* 21608c2ecf20Sopenharmony_ci * Wake up bdi's periodically to make sure dirtytime inodes gets 21618c2ecf20Sopenharmony_ci * written back periodically. We deliberately do *not* check the 21628c2ecf20Sopenharmony_ci * b_dirtytime list in wb_has_dirty_io(), since this would cause the 21638c2ecf20Sopenharmony_ci * kernel to be constantly waking up once there are any dirtytime 21648c2ecf20Sopenharmony_ci * inodes on the system. So instead we define a separate delayed work 21658c2ecf20Sopenharmony_ci * function which gets called much more rarely. (By default, only 21668c2ecf20Sopenharmony_ci * once every 12 hours.) 21678c2ecf20Sopenharmony_ci * 21688c2ecf20Sopenharmony_ci * If there is any other write activity going on in the file system, 21698c2ecf20Sopenharmony_ci * this function won't be necessary. But if the only thing that has 21708c2ecf20Sopenharmony_ci * happened on the file system is a dirtytime inode caused by an atime 21718c2ecf20Sopenharmony_ci * update, we need this infrastructure below to make sure that inode 21728c2ecf20Sopenharmony_ci * eventually gets pushed out to disk. 
21738c2ecf20Sopenharmony_ci */ 21748c2ecf20Sopenharmony_cistatic void wakeup_dirtytime_writeback(struct work_struct *w); 21758c2ecf20Sopenharmony_cistatic DECLARE_DELAYED_WORK(dirtytime_work, wakeup_dirtytime_writeback); 21768c2ecf20Sopenharmony_ci 21778c2ecf20Sopenharmony_cistatic void wakeup_dirtytime_writeback(struct work_struct *w) 21788c2ecf20Sopenharmony_ci{ 21798c2ecf20Sopenharmony_ci struct backing_dev_info *bdi; 21808c2ecf20Sopenharmony_ci 21818c2ecf20Sopenharmony_ci rcu_read_lock(); 21828c2ecf20Sopenharmony_ci list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { 21838c2ecf20Sopenharmony_ci struct bdi_writeback *wb; 21848c2ecf20Sopenharmony_ci 21858c2ecf20Sopenharmony_ci list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) 21868c2ecf20Sopenharmony_ci if (!list_empty(&wb->b_dirty_time)) 21878c2ecf20Sopenharmony_ci wb_wakeup(wb); 21888c2ecf20Sopenharmony_ci } 21898c2ecf20Sopenharmony_ci rcu_read_unlock(); 21908c2ecf20Sopenharmony_ci schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ); 21918c2ecf20Sopenharmony_ci} 21928c2ecf20Sopenharmony_ci 21938c2ecf20Sopenharmony_cistatic int __init start_dirtytime_writeback(void) 21948c2ecf20Sopenharmony_ci{ 21958c2ecf20Sopenharmony_ci schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ); 21968c2ecf20Sopenharmony_ci return 0; 21978c2ecf20Sopenharmony_ci} 21988c2ecf20Sopenharmony_ci__initcall(start_dirtytime_writeback); 21998c2ecf20Sopenharmony_ci 22008c2ecf20Sopenharmony_ciint dirtytime_interval_handler(struct ctl_table *table, int write, 22018c2ecf20Sopenharmony_ci void *buffer, size_t *lenp, loff_t *ppos) 22028c2ecf20Sopenharmony_ci{ 22038c2ecf20Sopenharmony_ci int ret; 22048c2ecf20Sopenharmony_ci 22058c2ecf20Sopenharmony_ci ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 22068c2ecf20Sopenharmony_ci if (ret == 0 && write) 22078c2ecf20Sopenharmony_ci mod_delayed_work(system_wq, &dirtytime_work, 0); 22088c2ecf20Sopenharmony_ci return ret; 22098c2ecf20Sopenharmony_ci} 
22108c2ecf20Sopenharmony_ci 22118c2ecf20Sopenharmony_ci/** 22128c2ecf20Sopenharmony_ci * __mark_inode_dirty - internal function 22138c2ecf20Sopenharmony_ci * 22148c2ecf20Sopenharmony_ci * @inode: inode to mark 22158c2ecf20Sopenharmony_ci * @flags: what kind of dirty (i.e. I_DIRTY_SYNC) 22168c2ecf20Sopenharmony_ci * 22178c2ecf20Sopenharmony_ci * Mark an inode as dirty. Callers should use mark_inode_dirty or 22188c2ecf20Sopenharmony_ci * mark_inode_dirty_sync. 22198c2ecf20Sopenharmony_ci * 22208c2ecf20Sopenharmony_ci * Put the inode on the super block's dirty list. 22218c2ecf20Sopenharmony_ci * 22228c2ecf20Sopenharmony_ci * CAREFUL! We mark it dirty unconditionally, but move it onto the 22238c2ecf20Sopenharmony_ci * dirty list only if it is hashed or if it refers to a blockdev. 22248c2ecf20Sopenharmony_ci * If it was not hashed, it will never be added to the dirty list 22258c2ecf20Sopenharmony_ci * even if it is later hashed, as it will have been marked dirty already. 22268c2ecf20Sopenharmony_ci * 22278c2ecf20Sopenharmony_ci * In short, make sure you hash any inodes _before_ you start marking 22288c2ecf20Sopenharmony_ci * them dirty. 22298c2ecf20Sopenharmony_ci * 22308c2ecf20Sopenharmony_ci * Note that for blockdevs, inode->dirtied_when represents the dirtying time of 22318c2ecf20Sopenharmony_ci * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of 22328c2ecf20Sopenharmony_ci * the kernel-internal blockdev inode represents the dirtying time of the 22338c2ecf20Sopenharmony_ci * blockdev's pages. This is why for I_DIRTY_PAGES we always use 22348c2ecf20Sopenharmony_ci * page->mapping->host, so the page-dirtying time is recorded in the internal 22358c2ecf20Sopenharmony_ci * blockdev inode. 
22368c2ecf20Sopenharmony_ci */ 22378c2ecf20Sopenharmony_civoid __mark_inode_dirty(struct inode *inode, int flags) 22388c2ecf20Sopenharmony_ci{ 22398c2ecf20Sopenharmony_ci struct super_block *sb = inode->i_sb; 22408c2ecf20Sopenharmony_ci int dirtytime; 22418c2ecf20Sopenharmony_ci 22428c2ecf20Sopenharmony_ci trace_writeback_mark_inode_dirty(inode, flags); 22438c2ecf20Sopenharmony_ci 22448c2ecf20Sopenharmony_ci /* 22458c2ecf20Sopenharmony_ci * Don't do this for I_DIRTY_PAGES - that doesn't actually 22468c2ecf20Sopenharmony_ci * dirty the inode itself 22478c2ecf20Sopenharmony_ci */ 22488c2ecf20Sopenharmony_ci if (flags & (I_DIRTY_INODE | I_DIRTY_TIME)) { 22498c2ecf20Sopenharmony_ci trace_writeback_dirty_inode_start(inode, flags); 22508c2ecf20Sopenharmony_ci 22518c2ecf20Sopenharmony_ci if (sb->s_op->dirty_inode) 22528c2ecf20Sopenharmony_ci sb->s_op->dirty_inode(inode, flags); 22538c2ecf20Sopenharmony_ci 22548c2ecf20Sopenharmony_ci trace_writeback_dirty_inode(inode, flags); 22558c2ecf20Sopenharmony_ci } 22568c2ecf20Sopenharmony_ci if (flags & I_DIRTY_INODE) 22578c2ecf20Sopenharmony_ci flags &= ~I_DIRTY_TIME; 22588c2ecf20Sopenharmony_ci dirtytime = flags & I_DIRTY_TIME; 22598c2ecf20Sopenharmony_ci 22608c2ecf20Sopenharmony_ci /* 22618c2ecf20Sopenharmony_ci * Paired with smp_mb() in __writeback_single_inode() for the 22628c2ecf20Sopenharmony_ci * following lockless i_state test. See there for details. 
22638c2ecf20Sopenharmony_ci */ 22648c2ecf20Sopenharmony_ci smp_mb(); 22658c2ecf20Sopenharmony_ci 22668c2ecf20Sopenharmony_ci if (((inode->i_state & flags) == flags) || 22678c2ecf20Sopenharmony_ci (dirtytime && (inode->i_state & I_DIRTY_INODE))) 22688c2ecf20Sopenharmony_ci return; 22698c2ecf20Sopenharmony_ci 22708c2ecf20Sopenharmony_ci spin_lock(&inode->i_lock); 22718c2ecf20Sopenharmony_ci if (dirtytime && (inode->i_state & I_DIRTY_INODE)) 22728c2ecf20Sopenharmony_ci goto out_unlock_inode; 22738c2ecf20Sopenharmony_ci if ((inode->i_state & flags) != flags) { 22748c2ecf20Sopenharmony_ci const int was_dirty = inode->i_state & I_DIRTY; 22758c2ecf20Sopenharmony_ci 22768c2ecf20Sopenharmony_ci inode_attach_wb(inode, NULL); 22778c2ecf20Sopenharmony_ci 22788c2ecf20Sopenharmony_ci if (flags & I_DIRTY_INODE) 22798c2ecf20Sopenharmony_ci inode->i_state &= ~I_DIRTY_TIME; 22808c2ecf20Sopenharmony_ci inode->i_state |= flags; 22818c2ecf20Sopenharmony_ci 22828c2ecf20Sopenharmony_ci /* 22838c2ecf20Sopenharmony_ci * If the inode is queued for writeback by flush worker, just 22848c2ecf20Sopenharmony_ci * update its dirty state. Once the flush worker is done with 22858c2ecf20Sopenharmony_ci * the inode it will place it on the appropriate superblock 22868c2ecf20Sopenharmony_ci * list, based upon its state. 22878c2ecf20Sopenharmony_ci */ 22888c2ecf20Sopenharmony_ci if (inode->i_state & I_SYNC_QUEUED) 22898c2ecf20Sopenharmony_ci goto out_unlock_inode; 22908c2ecf20Sopenharmony_ci 22918c2ecf20Sopenharmony_ci /* 22928c2ecf20Sopenharmony_ci * Only add valid (hashed) inodes to the superblock's 22938c2ecf20Sopenharmony_ci * dirty list. Add blockdev inodes as well. 
22948c2ecf20Sopenharmony_ci */ 22958c2ecf20Sopenharmony_ci if (!S_ISBLK(inode->i_mode)) { 22968c2ecf20Sopenharmony_ci if (inode_unhashed(inode)) 22978c2ecf20Sopenharmony_ci goto out_unlock_inode; 22988c2ecf20Sopenharmony_ci } 22998c2ecf20Sopenharmony_ci if (inode->i_state & I_FREEING) 23008c2ecf20Sopenharmony_ci goto out_unlock_inode; 23018c2ecf20Sopenharmony_ci 23028c2ecf20Sopenharmony_ci /* 23038c2ecf20Sopenharmony_ci * If the inode was already on b_dirty/b_io/b_more_io, don't 23048c2ecf20Sopenharmony_ci * reposition it (that would break b_dirty time-ordering). 23058c2ecf20Sopenharmony_ci */ 23068c2ecf20Sopenharmony_ci if (!was_dirty) { 23078c2ecf20Sopenharmony_ci struct bdi_writeback *wb; 23088c2ecf20Sopenharmony_ci struct list_head *dirty_list; 23098c2ecf20Sopenharmony_ci bool wakeup_bdi = false; 23108c2ecf20Sopenharmony_ci 23118c2ecf20Sopenharmony_ci wb = locked_inode_to_wb_and_lock_list(inode); 23128c2ecf20Sopenharmony_ci 23138c2ecf20Sopenharmony_ci inode->dirtied_when = jiffies; 23148c2ecf20Sopenharmony_ci if (dirtytime) 23158c2ecf20Sopenharmony_ci inode->dirtied_time_when = jiffies; 23168c2ecf20Sopenharmony_ci 23178c2ecf20Sopenharmony_ci if (inode->i_state & I_DIRTY) 23188c2ecf20Sopenharmony_ci dirty_list = &wb->b_dirty; 23198c2ecf20Sopenharmony_ci else 23208c2ecf20Sopenharmony_ci dirty_list = &wb->b_dirty_time; 23218c2ecf20Sopenharmony_ci 23228c2ecf20Sopenharmony_ci wakeup_bdi = inode_io_list_move_locked(inode, wb, 23238c2ecf20Sopenharmony_ci dirty_list); 23248c2ecf20Sopenharmony_ci 23258c2ecf20Sopenharmony_ci spin_unlock(&wb->list_lock); 23268c2ecf20Sopenharmony_ci trace_writeback_dirty_inode_enqueue(inode); 23278c2ecf20Sopenharmony_ci 23288c2ecf20Sopenharmony_ci /* 23298c2ecf20Sopenharmony_ci * If this is the first dirty inode for this bdi, 23308c2ecf20Sopenharmony_ci * we have to wake-up the corresponding bdi thread 23318c2ecf20Sopenharmony_ci * to make sure background write-back happens 23328c2ecf20Sopenharmony_ci * later. 
23338c2ecf20Sopenharmony_ci */ 23348c2ecf20Sopenharmony_ci if (wakeup_bdi && 23358c2ecf20Sopenharmony_ci (wb->bdi->capabilities & BDI_CAP_WRITEBACK)) 23368c2ecf20Sopenharmony_ci wb_wakeup_delayed(wb); 23378c2ecf20Sopenharmony_ci return; 23388c2ecf20Sopenharmony_ci } 23398c2ecf20Sopenharmony_ci } 23408c2ecf20Sopenharmony_ciout_unlock_inode: 23418c2ecf20Sopenharmony_ci spin_unlock(&inode->i_lock); 23428c2ecf20Sopenharmony_ci} 23438c2ecf20Sopenharmony_ciEXPORT_SYMBOL(__mark_inode_dirty); 23448c2ecf20Sopenharmony_ci 23458c2ecf20Sopenharmony_ci/* 23468c2ecf20Sopenharmony_ci * The @s_sync_lock is used to serialise concurrent sync operations 23478c2ecf20Sopenharmony_ci * to avoid lock contention problems with concurrent wait_sb_inodes() calls. 23488c2ecf20Sopenharmony_ci * Concurrent callers will block on the s_sync_lock rather than doing contending 23498c2ecf20Sopenharmony_ci * walks. The queueing maintains sync(2) required behaviour as all the IO that 23508c2ecf20Sopenharmony_ci * has been issued up to the time this function is enter is guaranteed to be 23518c2ecf20Sopenharmony_ci * completed by the time we have gained the lock and waited for all IO that is 23528c2ecf20Sopenharmony_ci * in progress regardless of the order callers are granted the lock. 23538c2ecf20Sopenharmony_ci */ 23548c2ecf20Sopenharmony_cistatic void wait_sb_inodes(struct super_block *sb) 23558c2ecf20Sopenharmony_ci{ 23568c2ecf20Sopenharmony_ci LIST_HEAD(sync_list); 23578c2ecf20Sopenharmony_ci 23588c2ecf20Sopenharmony_ci /* 23598c2ecf20Sopenharmony_ci * We need to be protected against the filesystem going from 23608c2ecf20Sopenharmony_ci * r/o to r/w or vice versa. 
23618c2ecf20Sopenharmony_ci */ 23628c2ecf20Sopenharmony_ci WARN_ON(!rwsem_is_locked(&sb->s_umount)); 23638c2ecf20Sopenharmony_ci 23648c2ecf20Sopenharmony_ci mutex_lock(&sb->s_sync_lock); 23658c2ecf20Sopenharmony_ci 23668c2ecf20Sopenharmony_ci /* 23678c2ecf20Sopenharmony_ci * Splice the writeback list onto a temporary list to avoid waiting on 23688c2ecf20Sopenharmony_ci * inodes that have started writeback after this point. 23698c2ecf20Sopenharmony_ci * 23708c2ecf20Sopenharmony_ci * Use rcu_read_lock() to keep the inodes around until we have a 23718c2ecf20Sopenharmony_ci * reference. s_inode_wblist_lock protects sb->s_inodes_wb as well as 23728c2ecf20Sopenharmony_ci * the local list because inodes can be dropped from either by writeback 23738c2ecf20Sopenharmony_ci * completion. 23748c2ecf20Sopenharmony_ci */ 23758c2ecf20Sopenharmony_ci rcu_read_lock(); 23768c2ecf20Sopenharmony_ci spin_lock_irq(&sb->s_inode_wblist_lock); 23778c2ecf20Sopenharmony_ci list_splice_init(&sb->s_inodes_wb, &sync_list); 23788c2ecf20Sopenharmony_ci 23798c2ecf20Sopenharmony_ci /* 23808c2ecf20Sopenharmony_ci * Data integrity sync. Must wait for all pages under writeback, because 23818c2ecf20Sopenharmony_ci * there may have been pages dirtied before our sync call, but which had 23828c2ecf20Sopenharmony_ci * writeout started before we write it out. In which case, the inode 23838c2ecf20Sopenharmony_ci * may not be on the dirty list, but we still have to wait for that 23848c2ecf20Sopenharmony_ci * writeout. 
23858c2ecf20Sopenharmony_ci */ 23868c2ecf20Sopenharmony_ci while (!list_empty(&sync_list)) { 23878c2ecf20Sopenharmony_ci struct inode *inode = list_first_entry(&sync_list, struct inode, 23888c2ecf20Sopenharmony_ci i_wb_list); 23898c2ecf20Sopenharmony_ci struct address_space *mapping = inode->i_mapping; 23908c2ecf20Sopenharmony_ci 23918c2ecf20Sopenharmony_ci /* 23928c2ecf20Sopenharmony_ci * Move each inode back to the wb list before we drop the lock 23938c2ecf20Sopenharmony_ci * to preserve consistency between i_wb_list and the mapping 23948c2ecf20Sopenharmony_ci * writeback tag. Writeback completion is responsible to remove 23958c2ecf20Sopenharmony_ci * the inode from either list once the writeback tag is cleared. 23968c2ecf20Sopenharmony_ci */ 23978c2ecf20Sopenharmony_ci list_move_tail(&inode->i_wb_list, &sb->s_inodes_wb); 23988c2ecf20Sopenharmony_ci 23998c2ecf20Sopenharmony_ci /* 24008c2ecf20Sopenharmony_ci * The mapping can appear untagged while still on-list since we 24018c2ecf20Sopenharmony_ci * do not have the mapping lock. Skip it here, wb completion 24028c2ecf20Sopenharmony_ci * will remove it. 
24038c2ecf20Sopenharmony_ci */ 24048c2ecf20Sopenharmony_ci if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) 24058c2ecf20Sopenharmony_ci continue; 24068c2ecf20Sopenharmony_ci 24078c2ecf20Sopenharmony_ci spin_unlock_irq(&sb->s_inode_wblist_lock); 24088c2ecf20Sopenharmony_ci 24098c2ecf20Sopenharmony_ci spin_lock(&inode->i_lock); 24108c2ecf20Sopenharmony_ci if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) { 24118c2ecf20Sopenharmony_ci spin_unlock(&inode->i_lock); 24128c2ecf20Sopenharmony_ci 24138c2ecf20Sopenharmony_ci spin_lock_irq(&sb->s_inode_wblist_lock); 24148c2ecf20Sopenharmony_ci continue; 24158c2ecf20Sopenharmony_ci } 24168c2ecf20Sopenharmony_ci __iget(inode); 24178c2ecf20Sopenharmony_ci spin_unlock(&inode->i_lock); 24188c2ecf20Sopenharmony_ci rcu_read_unlock(); 24198c2ecf20Sopenharmony_ci 24208c2ecf20Sopenharmony_ci /* 24218c2ecf20Sopenharmony_ci * We keep the error status of individual mapping so that 24228c2ecf20Sopenharmony_ci * applications can catch the writeback error using fsync(2). 24238c2ecf20Sopenharmony_ci * See filemap_fdatawait_keep_errors() for details. 
24248c2ecf20Sopenharmony_ci */ 24258c2ecf20Sopenharmony_ci filemap_fdatawait_keep_errors(mapping); 24268c2ecf20Sopenharmony_ci 24278c2ecf20Sopenharmony_ci cond_resched(); 24288c2ecf20Sopenharmony_ci 24298c2ecf20Sopenharmony_ci iput(inode); 24308c2ecf20Sopenharmony_ci 24318c2ecf20Sopenharmony_ci rcu_read_lock(); 24328c2ecf20Sopenharmony_ci spin_lock_irq(&sb->s_inode_wblist_lock); 24338c2ecf20Sopenharmony_ci } 24348c2ecf20Sopenharmony_ci spin_unlock_irq(&sb->s_inode_wblist_lock); 24358c2ecf20Sopenharmony_ci rcu_read_unlock(); 24368c2ecf20Sopenharmony_ci mutex_unlock(&sb->s_sync_lock); 24378c2ecf20Sopenharmony_ci} 24388c2ecf20Sopenharmony_ci 24398c2ecf20Sopenharmony_cistatic void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr, 24408c2ecf20Sopenharmony_ci enum wb_reason reason, bool skip_if_busy) 24418c2ecf20Sopenharmony_ci{ 24428c2ecf20Sopenharmony_ci struct backing_dev_info *bdi = sb->s_bdi; 24438c2ecf20Sopenharmony_ci DEFINE_WB_COMPLETION(done, bdi); 24448c2ecf20Sopenharmony_ci struct wb_writeback_work work = { 24458c2ecf20Sopenharmony_ci .sb = sb, 24468c2ecf20Sopenharmony_ci .sync_mode = WB_SYNC_NONE, 24478c2ecf20Sopenharmony_ci .tagged_writepages = 1, 24488c2ecf20Sopenharmony_ci .done = &done, 24498c2ecf20Sopenharmony_ci .nr_pages = nr, 24508c2ecf20Sopenharmony_ci .reason = reason, 24518c2ecf20Sopenharmony_ci }; 24528c2ecf20Sopenharmony_ci 24538c2ecf20Sopenharmony_ci if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info) 24548c2ecf20Sopenharmony_ci return; 24558c2ecf20Sopenharmony_ci WARN_ON(!rwsem_is_locked(&sb->s_umount)); 24568c2ecf20Sopenharmony_ci 24578c2ecf20Sopenharmony_ci bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy); 24588c2ecf20Sopenharmony_ci wb_wait_for_completion(&done); 24598c2ecf20Sopenharmony_ci} 24608c2ecf20Sopenharmony_ci 24618c2ecf20Sopenharmony_ci/** 24628c2ecf20Sopenharmony_ci * writeback_inodes_sb_nr - writeback dirty inodes from given super_block 24638c2ecf20Sopenharmony_ci * @sb: the superblock 
24648c2ecf20Sopenharmony_ci * @nr: the number of pages to write 24658c2ecf20Sopenharmony_ci * @reason: reason why some writeback work initiated 24668c2ecf20Sopenharmony_ci * 24678c2ecf20Sopenharmony_ci * Start writeback on some inodes on this super_block. No guarantees are made 24688c2ecf20Sopenharmony_ci * on how many (if any) will be written, and this function does not wait 24698c2ecf20Sopenharmony_ci * for IO completion of submitted IO. 24708c2ecf20Sopenharmony_ci */ 24718c2ecf20Sopenharmony_civoid writeback_inodes_sb_nr(struct super_block *sb, 24728c2ecf20Sopenharmony_ci unsigned long nr, 24738c2ecf20Sopenharmony_ci enum wb_reason reason) 24748c2ecf20Sopenharmony_ci{ 24758c2ecf20Sopenharmony_ci __writeback_inodes_sb_nr(sb, nr, reason, false); 24768c2ecf20Sopenharmony_ci} 24778c2ecf20Sopenharmony_ciEXPORT_SYMBOL(writeback_inodes_sb_nr); 24788c2ecf20Sopenharmony_ci 24798c2ecf20Sopenharmony_ci/** 24808c2ecf20Sopenharmony_ci * writeback_inodes_sb - writeback dirty inodes from given super_block 24818c2ecf20Sopenharmony_ci * @sb: the superblock 24828c2ecf20Sopenharmony_ci * @reason: reason why some writeback work was initiated 24838c2ecf20Sopenharmony_ci * 24848c2ecf20Sopenharmony_ci * Start writeback on some inodes on this super_block. No guarantees are made 24858c2ecf20Sopenharmony_ci * on how many (if any) will be written, and this function does not wait 24868c2ecf20Sopenharmony_ci * for IO completion of submitted IO. 
24878c2ecf20Sopenharmony_ci */ 24888c2ecf20Sopenharmony_civoid writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) 24898c2ecf20Sopenharmony_ci{ 24908c2ecf20Sopenharmony_ci return writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason); 24918c2ecf20Sopenharmony_ci} 24928c2ecf20Sopenharmony_ciEXPORT_SYMBOL(writeback_inodes_sb); 24938c2ecf20Sopenharmony_ci 24948c2ecf20Sopenharmony_ci/** 24958c2ecf20Sopenharmony_ci * try_to_writeback_inodes_sb - try to start writeback if none underway 24968c2ecf20Sopenharmony_ci * @sb: the superblock 24978c2ecf20Sopenharmony_ci * @reason: reason why some writeback work was initiated 24988c2ecf20Sopenharmony_ci * 24998c2ecf20Sopenharmony_ci * Invoke __writeback_inodes_sb_nr if no writeback is currently underway. 25008c2ecf20Sopenharmony_ci */ 25018c2ecf20Sopenharmony_civoid try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) 25028c2ecf20Sopenharmony_ci{ 25038c2ecf20Sopenharmony_ci if (!down_read_trylock(&sb->s_umount)) 25048c2ecf20Sopenharmony_ci return; 25058c2ecf20Sopenharmony_ci 25068c2ecf20Sopenharmony_ci __writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason, true); 25078c2ecf20Sopenharmony_ci up_read(&sb->s_umount); 25088c2ecf20Sopenharmony_ci} 25098c2ecf20Sopenharmony_ciEXPORT_SYMBOL(try_to_writeback_inodes_sb); 25108c2ecf20Sopenharmony_ci 25118c2ecf20Sopenharmony_ci/** 25128c2ecf20Sopenharmony_ci * sync_inodes_sb - sync sb inode pages 25138c2ecf20Sopenharmony_ci * @sb: the superblock 25148c2ecf20Sopenharmony_ci * 25158c2ecf20Sopenharmony_ci * This function writes and waits on any dirty inode belonging to this 25168c2ecf20Sopenharmony_ci * super_block. 
25178c2ecf20Sopenharmony_ci */ 25188c2ecf20Sopenharmony_civoid sync_inodes_sb(struct super_block *sb) 25198c2ecf20Sopenharmony_ci{ 25208c2ecf20Sopenharmony_ci struct backing_dev_info *bdi = sb->s_bdi; 25218c2ecf20Sopenharmony_ci DEFINE_WB_COMPLETION(done, bdi); 25228c2ecf20Sopenharmony_ci struct wb_writeback_work work = { 25238c2ecf20Sopenharmony_ci .sb = sb, 25248c2ecf20Sopenharmony_ci .sync_mode = WB_SYNC_ALL, 25258c2ecf20Sopenharmony_ci .nr_pages = LONG_MAX, 25268c2ecf20Sopenharmony_ci .range_cyclic = 0, 25278c2ecf20Sopenharmony_ci .done = &done, 25288c2ecf20Sopenharmony_ci .reason = WB_REASON_SYNC, 25298c2ecf20Sopenharmony_ci .for_sync = 1, 25308c2ecf20Sopenharmony_ci }; 25318c2ecf20Sopenharmony_ci 25328c2ecf20Sopenharmony_ci /* 25338c2ecf20Sopenharmony_ci * Can't skip on !bdi_has_dirty() because we should wait for !dirty 25348c2ecf20Sopenharmony_ci * inodes under writeback and I_DIRTY_TIME inodes ignored by 25358c2ecf20Sopenharmony_ci * bdi_has_dirty() need to be written out too. 25368c2ecf20Sopenharmony_ci */ 25378c2ecf20Sopenharmony_ci if (bdi == &noop_backing_dev_info) 25388c2ecf20Sopenharmony_ci return; 25398c2ecf20Sopenharmony_ci WARN_ON(!rwsem_is_locked(&sb->s_umount)); 25408c2ecf20Sopenharmony_ci 25418c2ecf20Sopenharmony_ci /* protect against inode wb switch, see inode_switch_wbs_work_fn() */ 25428c2ecf20Sopenharmony_ci bdi_down_write_wb_switch_rwsem(bdi); 25438c2ecf20Sopenharmony_ci bdi_split_work_to_wbs(bdi, &work, false); 25448c2ecf20Sopenharmony_ci wb_wait_for_completion(&done); 25458c2ecf20Sopenharmony_ci bdi_up_write_wb_switch_rwsem(bdi); 25468c2ecf20Sopenharmony_ci 25478c2ecf20Sopenharmony_ci wait_sb_inodes(sb); 25488c2ecf20Sopenharmony_ci} 25498c2ecf20Sopenharmony_ciEXPORT_SYMBOL(sync_inodes_sb); 25508c2ecf20Sopenharmony_ci 25518c2ecf20Sopenharmony_ci/** 25528c2ecf20Sopenharmony_ci * write_inode_now - write an inode to disk 25538c2ecf20Sopenharmony_ci * @inode: inode to write to disk 25548c2ecf20Sopenharmony_ci * @sync: whether the write should 
be synchronous or not 25558c2ecf20Sopenharmony_ci * 25568c2ecf20Sopenharmony_ci * This function commits an inode to disk immediately if it is dirty. This is 25578c2ecf20Sopenharmony_ci * primarily needed by knfsd. 25588c2ecf20Sopenharmony_ci * 25598c2ecf20Sopenharmony_ci * The caller must either have a ref on the inode or must have set I_WILL_FREE. 25608c2ecf20Sopenharmony_ci */ 25618c2ecf20Sopenharmony_ciint write_inode_now(struct inode *inode, int sync) 25628c2ecf20Sopenharmony_ci{ 25638c2ecf20Sopenharmony_ci struct writeback_control wbc = { 25648c2ecf20Sopenharmony_ci .nr_to_write = LONG_MAX, 25658c2ecf20Sopenharmony_ci .sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE, 25668c2ecf20Sopenharmony_ci .range_start = 0, 25678c2ecf20Sopenharmony_ci .range_end = LLONG_MAX, 25688c2ecf20Sopenharmony_ci }; 25698c2ecf20Sopenharmony_ci 25708c2ecf20Sopenharmony_ci if (!mapping_can_writeback(inode->i_mapping)) 25718c2ecf20Sopenharmony_ci wbc.nr_to_write = 0; 25728c2ecf20Sopenharmony_ci 25738c2ecf20Sopenharmony_ci might_sleep(); 25748c2ecf20Sopenharmony_ci return writeback_single_inode(inode, &wbc); 25758c2ecf20Sopenharmony_ci} 25768c2ecf20Sopenharmony_ciEXPORT_SYMBOL(write_inode_now); 25778c2ecf20Sopenharmony_ci 25788c2ecf20Sopenharmony_ci/** 25798c2ecf20Sopenharmony_ci * sync_inode - write an inode and its pages to disk. 25808c2ecf20Sopenharmony_ci * @inode: the inode to sync 25818c2ecf20Sopenharmony_ci * @wbc: controls the writeback mode 25828c2ecf20Sopenharmony_ci * 25838c2ecf20Sopenharmony_ci * sync_inode() will write an inode and its pages to disk. It will also 25848c2ecf20Sopenharmony_ci * correctly update the inode on its superblock's dirty inode lists and will 25858c2ecf20Sopenharmony_ci * update inode->i_state. 25868c2ecf20Sopenharmony_ci * 25878c2ecf20Sopenharmony_ci * The caller must have a ref on the inode. 
25888c2ecf20Sopenharmony_ci */ 25898c2ecf20Sopenharmony_ciint sync_inode(struct inode *inode, struct writeback_control *wbc) 25908c2ecf20Sopenharmony_ci{ 25918c2ecf20Sopenharmony_ci return writeback_single_inode(inode, wbc); 25928c2ecf20Sopenharmony_ci} 25938c2ecf20Sopenharmony_ciEXPORT_SYMBOL(sync_inode); 25948c2ecf20Sopenharmony_ci 25958c2ecf20Sopenharmony_ci/** 25968c2ecf20Sopenharmony_ci * sync_inode_metadata - write an inode to disk 25978c2ecf20Sopenharmony_ci * @inode: the inode to sync 25988c2ecf20Sopenharmony_ci * @wait: wait for I/O to complete. 25998c2ecf20Sopenharmony_ci * 26008c2ecf20Sopenharmony_ci * Write an inode to disk and adjust its dirty state after completion. 26018c2ecf20Sopenharmony_ci * 26028c2ecf20Sopenharmony_ci * Note: only writes the actual inode, no associated data or other metadata. 26038c2ecf20Sopenharmony_ci */ 26048c2ecf20Sopenharmony_ciint sync_inode_metadata(struct inode *inode, int wait) 26058c2ecf20Sopenharmony_ci{ 26068c2ecf20Sopenharmony_ci struct writeback_control wbc = { 26078c2ecf20Sopenharmony_ci .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE, 26088c2ecf20Sopenharmony_ci .nr_to_write = 0, /* metadata-only */ 26098c2ecf20Sopenharmony_ci }; 26108c2ecf20Sopenharmony_ci 26118c2ecf20Sopenharmony_ci return sync_inode(inode, &wbc); 26128c2ecf20Sopenharmony_ci} 26138c2ecf20Sopenharmony_ciEXPORT_SYMBOL(sync_inode_metadata); 2614