18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only 28c2ecf20Sopenharmony_ci/* 38c2ecf20Sopenharmony_ci * mm/page-writeback.c 48c2ecf20Sopenharmony_ci * 58c2ecf20Sopenharmony_ci * Copyright (C) 2002, Linus Torvalds. 68c2ecf20Sopenharmony_ci * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra 78c2ecf20Sopenharmony_ci * 88c2ecf20Sopenharmony_ci * Contains functions related to writing back dirty pages at the 98c2ecf20Sopenharmony_ci * address_space level. 108c2ecf20Sopenharmony_ci * 118c2ecf20Sopenharmony_ci * 10Apr2002 Andrew Morton 128c2ecf20Sopenharmony_ci * Initial version 138c2ecf20Sopenharmony_ci */ 148c2ecf20Sopenharmony_ci 158c2ecf20Sopenharmony_ci#include <linux/kernel.h> 168c2ecf20Sopenharmony_ci#include <linux/export.h> 178c2ecf20Sopenharmony_ci#include <linux/spinlock.h> 188c2ecf20Sopenharmony_ci#include <linux/fs.h> 198c2ecf20Sopenharmony_ci#include <linux/mm.h> 208c2ecf20Sopenharmony_ci#include <linux/swap.h> 218c2ecf20Sopenharmony_ci#include <linux/slab.h> 228c2ecf20Sopenharmony_ci#include <linux/pagemap.h> 238c2ecf20Sopenharmony_ci#include <linux/writeback.h> 248c2ecf20Sopenharmony_ci#include <linux/init.h> 258c2ecf20Sopenharmony_ci#include <linux/backing-dev.h> 268c2ecf20Sopenharmony_ci#include <linux/task_io_accounting_ops.h> 278c2ecf20Sopenharmony_ci#include <linux/blkdev.h> 288c2ecf20Sopenharmony_ci#include <linux/mpage.h> 298c2ecf20Sopenharmony_ci#include <linux/rmap.h> 308c2ecf20Sopenharmony_ci#include <linux/percpu.h> 318c2ecf20Sopenharmony_ci#include <linux/smp.h> 328c2ecf20Sopenharmony_ci#include <linux/sysctl.h> 338c2ecf20Sopenharmony_ci#include <linux/cpu.h> 348c2ecf20Sopenharmony_ci#include <linux/syscalls.h> 358c2ecf20Sopenharmony_ci#include <linux/buffer_head.h> /* __set_page_dirty_buffers */ 368c2ecf20Sopenharmony_ci#include <linux/pagevec.h> 378c2ecf20Sopenharmony_ci#include <linux/timer.h> 388c2ecf20Sopenharmony_ci#include <linux/sched/rt.h> 398c2ecf20Sopenharmony_ci#include <linux/sched/signal.h> 
408c2ecf20Sopenharmony_ci#include <linux/mm_inline.h> 418c2ecf20Sopenharmony_ci#include <trace/events/writeback.h> 428c2ecf20Sopenharmony_ci 438c2ecf20Sopenharmony_ci#include "internal.h" 448c2ecf20Sopenharmony_ci 458c2ecf20Sopenharmony_ci/* 468c2ecf20Sopenharmony_ci * Sleep at most 200ms at a time in balance_dirty_pages(). 478c2ecf20Sopenharmony_ci */ 488c2ecf20Sopenharmony_ci#define MAX_PAUSE max(HZ/5, 1) 498c2ecf20Sopenharmony_ci 508c2ecf20Sopenharmony_ci/* 518c2ecf20Sopenharmony_ci * Try to keep balance_dirty_pages() call intervals higher than this many pages 528c2ecf20Sopenharmony_ci * by raising pause time to max_pause when falls below it. 538c2ecf20Sopenharmony_ci */ 548c2ecf20Sopenharmony_ci#define DIRTY_POLL_THRESH (128 >> (PAGE_SHIFT - 10)) 558c2ecf20Sopenharmony_ci 568c2ecf20Sopenharmony_ci/* 578c2ecf20Sopenharmony_ci * Estimate write bandwidth at 200ms intervals. 588c2ecf20Sopenharmony_ci */ 598c2ecf20Sopenharmony_ci#define BANDWIDTH_INTERVAL max(HZ/5, 1) 608c2ecf20Sopenharmony_ci 618c2ecf20Sopenharmony_ci#define RATELIMIT_CALC_SHIFT 10 628c2ecf20Sopenharmony_ci 638c2ecf20Sopenharmony_ci/* 648c2ecf20Sopenharmony_ci * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited 658c2ecf20Sopenharmony_ci * will look to see if it needs to force writeback or throttling. 
668c2ecf20Sopenharmony_ci */ 678c2ecf20Sopenharmony_cistatic long ratelimit_pages = 32; 688c2ecf20Sopenharmony_ci 698c2ecf20Sopenharmony_ci/* The following parameters are exported via /proc/sys/vm */ 708c2ecf20Sopenharmony_ci 718c2ecf20Sopenharmony_ci/* 728c2ecf20Sopenharmony_ci * Start background writeback (via writeback threads) at this percentage 738c2ecf20Sopenharmony_ci */ 748c2ecf20Sopenharmony_ciint dirty_background_ratio = 10; 758c2ecf20Sopenharmony_ci 768c2ecf20Sopenharmony_ci/* 778c2ecf20Sopenharmony_ci * dirty_background_bytes starts at 0 (disabled) so that it is a function of 788c2ecf20Sopenharmony_ci * dirty_background_ratio * the amount of dirtyable memory 798c2ecf20Sopenharmony_ci */ 808c2ecf20Sopenharmony_ciunsigned long dirty_background_bytes; 818c2ecf20Sopenharmony_ci 828c2ecf20Sopenharmony_ci/* 838c2ecf20Sopenharmony_ci * free highmem will not be subtracted from the total free memory 848c2ecf20Sopenharmony_ci * for calculating free ratios if vm_highmem_is_dirtyable is true 858c2ecf20Sopenharmony_ci */ 868c2ecf20Sopenharmony_ciint vm_highmem_is_dirtyable; 878c2ecf20Sopenharmony_ci 888c2ecf20Sopenharmony_ci/* 898c2ecf20Sopenharmony_ci * The generator of dirty data starts writeback at this percentage 908c2ecf20Sopenharmony_ci */ 918c2ecf20Sopenharmony_ciint vm_dirty_ratio = 20; 928c2ecf20Sopenharmony_ci 938c2ecf20Sopenharmony_ci/* 948c2ecf20Sopenharmony_ci * vm_dirty_bytes starts at 0 (disabled) so that it is a function of 958c2ecf20Sopenharmony_ci * vm_dirty_ratio * the amount of dirtyable memory 968c2ecf20Sopenharmony_ci */ 978c2ecf20Sopenharmony_ciunsigned long vm_dirty_bytes; 988c2ecf20Sopenharmony_ci 998c2ecf20Sopenharmony_ci/* 1008c2ecf20Sopenharmony_ci * The interval between `kupdate'-style writebacks 1018c2ecf20Sopenharmony_ci */ 1028c2ecf20Sopenharmony_ciunsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */ 1038c2ecf20Sopenharmony_ci 1048c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(dirty_writeback_interval); 
1058c2ecf20Sopenharmony_ci 1068c2ecf20Sopenharmony_ci/* 1078c2ecf20Sopenharmony_ci * The longest time for which data is allowed to remain dirty 1088c2ecf20Sopenharmony_ci */ 1098c2ecf20Sopenharmony_ciunsigned int dirty_expire_interval = 30 * 100; /* centiseconds */ 1108c2ecf20Sopenharmony_ci 1118c2ecf20Sopenharmony_ci/* 1128c2ecf20Sopenharmony_ci * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies: 1138c2ecf20Sopenharmony_ci * a full sync is triggered after this time elapses without any disk activity. 1148c2ecf20Sopenharmony_ci */ 1158c2ecf20Sopenharmony_ciint laptop_mode; 1168c2ecf20Sopenharmony_ci 1178c2ecf20Sopenharmony_ciEXPORT_SYMBOL(laptop_mode); 1188c2ecf20Sopenharmony_ci 1198c2ecf20Sopenharmony_ci/* End of sysctl-exported parameters */ 1208c2ecf20Sopenharmony_ci 1218c2ecf20Sopenharmony_cistruct wb_domain global_wb_domain; 1228c2ecf20Sopenharmony_ci 1238c2ecf20Sopenharmony_ci/* consolidated parameters for balance_dirty_pages() and its subroutines */ 1248c2ecf20Sopenharmony_cistruct dirty_throttle_control { 1258c2ecf20Sopenharmony_ci#ifdef CONFIG_CGROUP_WRITEBACK 1268c2ecf20Sopenharmony_ci struct wb_domain *dom; 1278c2ecf20Sopenharmony_ci struct dirty_throttle_control *gdtc; /* only set in memcg dtc's */ 1288c2ecf20Sopenharmony_ci#endif 1298c2ecf20Sopenharmony_ci struct bdi_writeback *wb; 1308c2ecf20Sopenharmony_ci struct fprop_local_percpu *wb_completions; 1318c2ecf20Sopenharmony_ci 1328c2ecf20Sopenharmony_ci unsigned long avail; /* dirtyable */ 1338c2ecf20Sopenharmony_ci unsigned long dirty; /* file_dirty + write + nfs */ 1348c2ecf20Sopenharmony_ci unsigned long thresh; /* dirty threshold */ 1358c2ecf20Sopenharmony_ci unsigned long bg_thresh; /* dirty background threshold */ 1368c2ecf20Sopenharmony_ci 1378c2ecf20Sopenharmony_ci unsigned long wb_dirty; /* per-wb counterparts */ 1388c2ecf20Sopenharmony_ci unsigned long wb_thresh; 1398c2ecf20Sopenharmony_ci unsigned long wb_bg_thresh; 1408c2ecf20Sopenharmony_ci 
1418c2ecf20Sopenharmony_ci unsigned long pos_ratio; 1428c2ecf20Sopenharmony_ci}; 1438c2ecf20Sopenharmony_ci 1448c2ecf20Sopenharmony_ci/* 1458c2ecf20Sopenharmony_ci * Length of period for aging writeout fractions of bdis. This is an 1468c2ecf20Sopenharmony_ci * arbitrarily chosen number. The longer the period, the slower fractions will 1478c2ecf20Sopenharmony_ci * reflect changes in current writeout rate. 1488c2ecf20Sopenharmony_ci */ 1498c2ecf20Sopenharmony_ci#define VM_COMPLETIONS_PERIOD_LEN (3*HZ) 1508c2ecf20Sopenharmony_ci 1518c2ecf20Sopenharmony_ci#ifdef CONFIG_CGROUP_WRITEBACK 1528c2ecf20Sopenharmony_ci 1538c2ecf20Sopenharmony_ci#define GDTC_INIT(__wb) .wb = (__wb), \ 1548c2ecf20Sopenharmony_ci .dom = &global_wb_domain, \ 1558c2ecf20Sopenharmony_ci .wb_completions = &(__wb)->completions 1568c2ecf20Sopenharmony_ci 1578c2ecf20Sopenharmony_ci#define GDTC_INIT_NO_WB .dom = &global_wb_domain 1588c2ecf20Sopenharmony_ci 1598c2ecf20Sopenharmony_ci#define MDTC_INIT(__wb, __gdtc) .wb = (__wb), \ 1608c2ecf20Sopenharmony_ci .dom = mem_cgroup_wb_domain(__wb), \ 1618c2ecf20Sopenharmony_ci .wb_completions = &(__wb)->memcg_completions, \ 1628c2ecf20Sopenharmony_ci .gdtc = __gdtc 1638c2ecf20Sopenharmony_ci 1648c2ecf20Sopenharmony_cistatic bool mdtc_valid(struct dirty_throttle_control *dtc) 1658c2ecf20Sopenharmony_ci{ 1668c2ecf20Sopenharmony_ci return dtc->dom; 1678c2ecf20Sopenharmony_ci} 1688c2ecf20Sopenharmony_ci 1698c2ecf20Sopenharmony_cistatic struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc) 1708c2ecf20Sopenharmony_ci{ 1718c2ecf20Sopenharmony_ci return dtc->dom; 1728c2ecf20Sopenharmony_ci} 1738c2ecf20Sopenharmony_ci 1748c2ecf20Sopenharmony_cistatic struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc) 1758c2ecf20Sopenharmony_ci{ 1768c2ecf20Sopenharmony_ci return mdtc->gdtc; 1778c2ecf20Sopenharmony_ci} 1788c2ecf20Sopenharmony_ci 1798c2ecf20Sopenharmony_cistatic struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb) 
1808c2ecf20Sopenharmony_ci{ 1818c2ecf20Sopenharmony_ci return &wb->memcg_completions; 1828c2ecf20Sopenharmony_ci} 1838c2ecf20Sopenharmony_ci 1848c2ecf20Sopenharmony_cistatic void wb_min_max_ratio(struct bdi_writeback *wb, 1858c2ecf20Sopenharmony_ci unsigned long *minp, unsigned long *maxp) 1868c2ecf20Sopenharmony_ci{ 1878c2ecf20Sopenharmony_ci unsigned long this_bw = wb->avg_write_bandwidth; 1888c2ecf20Sopenharmony_ci unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth); 1898c2ecf20Sopenharmony_ci unsigned long long min = wb->bdi->min_ratio; 1908c2ecf20Sopenharmony_ci unsigned long long max = wb->bdi->max_ratio; 1918c2ecf20Sopenharmony_ci 1928c2ecf20Sopenharmony_ci /* 1938c2ecf20Sopenharmony_ci * @wb may already be clean by the time control reaches here and 1948c2ecf20Sopenharmony_ci * the total may not include its bw. 1958c2ecf20Sopenharmony_ci */ 1968c2ecf20Sopenharmony_ci if (this_bw < tot_bw) { 1978c2ecf20Sopenharmony_ci if (min) { 1988c2ecf20Sopenharmony_ci min *= this_bw; 1998c2ecf20Sopenharmony_ci min = div64_ul(min, tot_bw); 2008c2ecf20Sopenharmony_ci } 2018c2ecf20Sopenharmony_ci if (max < 100) { 2028c2ecf20Sopenharmony_ci max *= this_bw; 2038c2ecf20Sopenharmony_ci max = div64_ul(max, tot_bw); 2048c2ecf20Sopenharmony_ci } 2058c2ecf20Sopenharmony_ci } 2068c2ecf20Sopenharmony_ci 2078c2ecf20Sopenharmony_ci *minp = min; 2088c2ecf20Sopenharmony_ci *maxp = max; 2098c2ecf20Sopenharmony_ci} 2108c2ecf20Sopenharmony_ci 2118c2ecf20Sopenharmony_ci#else /* CONFIG_CGROUP_WRITEBACK */ 2128c2ecf20Sopenharmony_ci 2138c2ecf20Sopenharmony_ci#define GDTC_INIT(__wb) .wb = (__wb), \ 2148c2ecf20Sopenharmony_ci .wb_completions = &(__wb)->completions 2158c2ecf20Sopenharmony_ci#define GDTC_INIT_NO_WB 2168c2ecf20Sopenharmony_ci#define MDTC_INIT(__wb, __gdtc) 2178c2ecf20Sopenharmony_ci 2188c2ecf20Sopenharmony_cistatic bool mdtc_valid(struct dirty_throttle_control *dtc) 2198c2ecf20Sopenharmony_ci{ 2208c2ecf20Sopenharmony_ci return false; 2218c2ecf20Sopenharmony_ci} 
2228c2ecf20Sopenharmony_ci 2238c2ecf20Sopenharmony_cistatic struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc) 2248c2ecf20Sopenharmony_ci{ 2258c2ecf20Sopenharmony_ci return &global_wb_domain; 2268c2ecf20Sopenharmony_ci} 2278c2ecf20Sopenharmony_ci 2288c2ecf20Sopenharmony_cistatic struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc) 2298c2ecf20Sopenharmony_ci{ 2308c2ecf20Sopenharmony_ci return NULL; 2318c2ecf20Sopenharmony_ci} 2328c2ecf20Sopenharmony_ci 2338c2ecf20Sopenharmony_cistatic struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb) 2348c2ecf20Sopenharmony_ci{ 2358c2ecf20Sopenharmony_ci return NULL; 2368c2ecf20Sopenharmony_ci} 2378c2ecf20Sopenharmony_ci 2388c2ecf20Sopenharmony_cistatic void wb_min_max_ratio(struct bdi_writeback *wb, 2398c2ecf20Sopenharmony_ci unsigned long *minp, unsigned long *maxp) 2408c2ecf20Sopenharmony_ci{ 2418c2ecf20Sopenharmony_ci *minp = wb->bdi->min_ratio; 2428c2ecf20Sopenharmony_ci *maxp = wb->bdi->max_ratio; 2438c2ecf20Sopenharmony_ci} 2448c2ecf20Sopenharmony_ci 2458c2ecf20Sopenharmony_ci#endif /* CONFIG_CGROUP_WRITEBACK */ 2468c2ecf20Sopenharmony_ci 2478c2ecf20Sopenharmony_ci/* 2488c2ecf20Sopenharmony_ci * In a memory zone, there is a certain amount of pages we consider 2498c2ecf20Sopenharmony_ci * available for the page cache, which is essentially the number of 2508c2ecf20Sopenharmony_ci * free and reclaimable pages, minus some zone reserves to protect 2518c2ecf20Sopenharmony_ci * lowmem and the ability to uphold the zone's watermarks without 2528c2ecf20Sopenharmony_ci * requiring writeback. 2538c2ecf20Sopenharmony_ci * 2548c2ecf20Sopenharmony_ci * This number of dirtyable pages is the base value of which the 2558c2ecf20Sopenharmony_ci * user-configurable dirty ratio is the effective number of pages that 2568c2ecf20Sopenharmony_ci * are allowed to be actually dirtied. 
Per individual zone, or 2578c2ecf20Sopenharmony_ci * globally by using the sum of dirtyable pages over all zones. 2588c2ecf20Sopenharmony_ci * 2598c2ecf20Sopenharmony_ci * Because the user is allowed to specify the dirty limit globally as 2608c2ecf20Sopenharmony_ci * absolute number of bytes, calculating the per-zone dirty limit can 2618c2ecf20Sopenharmony_ci * require translating the configured limit into a percentage of 2628c2ecf20Sopenharmony_ci * global dirtyable memory first. 2638c2ecf20Sopenharmony_ci */ 2648c2ecf20Sopenharmony_ci 2658c2ecf20Sopenharmony_ci/** 2668c2ecf20Sopenharmony_ci * node_dirtyable_memory - number of dirtyable pages in a node 2678c2ecf20Sopenharmony_ci * @pgdat: the node 2688c2ecf20Sopenharmony_ci * 2698c2ecf20Sopenharmony_ci * Return: the node's number of pages potentially available for dirty 2708c2ecf20Sopenharmony_ci * page cache. This is the base value for the per-node dirty limits. 2718c2ecf20Sopenharmony_ci */ 2728c2ecf20Sopenharmony_cistatic unsigned long node_dirtyable_memory(struct pglist_data *pgdat) 2738c2ecf20Sopenharmony_ci{ 2748c2ecf20Sopenharmony_ci unsigned long nr_pages = 0; 2758c2ecf20Sopenharmony_ci int z; 2768c2ecf20Sopenharmony_ci 2778c2ecf20Sopenharmony_ci for (z = 0; z < MAX_NR_ZONES; z++) { 2788c2ecf20Sopenharmony_ci struct zone *zone = pgdat->node_zones + z; 2798c2ecf20Sopenharmony_ci 2808c2ecf20Sopenharmony_ci if (!populated_zone(zone)) 2818c2ecf20Sopenharmony_ci continue; 2828c2ecf20Sopenharmony_ci 2838c2ecf20Sopenharmony_ci nr_pages += zone_page_state(zone, NR_FREE_PAGES); 2848c2ecf20Sopenharmony_ci } 2858c2ecf20Sopenharmony_ci 2868c2ecf20Sopenharmony_ci /* 2878c2ecf20Sopenharmony_ci * Pages reserved for the kernel should not be considered 2888c2ecf20Sopenharmony_ci * dirtyable, to prevent a situation where reclaim has to 2898c2ecf20Sopenharmony_ci * clean pages in order to balance the zones. 
2908c2ecf20Sopenharmony_ci */ 2918c2ecf20Sopenharmony_ci nr_pages -= min(nr_pages, pgdat->totalreserve_pages); 2928c2ecf20Sopenharmony_ci 2938c2ecf20Sopenharmony_ci nr_pages += node_page_state(pgdat, NR_INACTIVE_FILE); 2948c2ecf20Sopenharmony_ci nr_pages += node_page_state(pgdat, NR_ACTIVE_FILE); 2958c2ecf20Sopenharmony_ci 2968c2ecf20Sopenharmony_ci return nr_pages; 2978c2ecf20Sopenharmony_ci} 2988c2ecf20Sopenharmony_ci 2998c2ecf20Sopenharmony_cistatic unsigned long highmem_dirtyable_memory(unsigned long total) 3008c2ecf20Sopenharmony_ci{ 3018c2ecf20Sopenharmony_ci#ifdef CONFIG_HIGHMEM 3028c2ecf20Sopenharmony_ci int node; 3038c2ecf20Sopenharmony_ci unsigned long x = 0; 3048c2ecf20Sopenharmony_ci int i; 3058c2ecf20Sopenharmony_ci 3068c2ecf20Sopenharmony_ci for_each_node_state(node, N_HIGH_MEMORY) { 3078c2ecf20Sopenharmony_ci for (i = ZONE_NORMAL + 1; i < MAX_NR_ZONES; i++) { 3088c2ecf20Sopenharmony_ci struct zone *z; 3098c2ecf20Sopenharmony_ci unsigned long nr_pages; 3108c2ecf20Sopenharmony_ci 3118c2ecf20Sopenharmony_ci if (!is_highmem_idx(i)) 3128c2ecf20Sopenharmony_ci continue; 3138c2ecf20Sopenharmony_ci 3148c2ecf20Sopenharmony_ci z = &NODE_DATA(node)->node_zones[i]; 3158c2ecf20Sopenharmony_ci if (!populated_zone(z)) 3168c2ecf20Sopenharmony_ci continue; 3178c2ecf20Sopenharmony_ci 3188c2ecf20Sopenharmony_ci nr_pages = zone_page_state(z, NR_FREE_PAGES); 3198c2ecf20Sopenharmony_ci /* watch for underflows */ 3208c2ecf20Sopenharmony_ci nr_pages -= min(nr_pages, high_wmark_pages(z)); 3218c2ecf20Sopenharmony_ci nr_pages += zone_page_state(z, NR_ZONE_INACTIVE_FILE); 3228c2ecf20Sopenharmony_ci nr_pages += zone_page_state(z, NR_ZONE_ACTIVE_FILE); 3238c2ecf20Sopenharmony_ci x += nr_pages; 3248c2ecf20Sopenharmony_ci } 3258c2ecf20Sopenharmony_ci } 3268c2ecf20Sopenharmony_ci 3278c2ecf20Sopenharmony_ci /* 3288c2ecf20Sopenharmony_ci * Unreclaimable memory (kernel memory or anonymous memory 3298c2ecf20Sopenharmony_ci * without swap) can bring down the dirtyable pages below 
3308c2ecf20Sopenharmony_ci * the zone's dirty balance reserve and the above calculation 3318c2ecf20Sopenharmony_ci * will underflow. However we still want to add in nodes 3328c2ecf20Sopenharmony_ci * which are below threshold (negative values) to get a more 3338c2ecf20Sopenharmony_ci * accurate calculation but make sure that the total never 3348c2ecf20Sopenharmony_ci * underflows. 3358c2ecf20Sopenharmony_ci */ 3368c2ecf20Sopenharmony_ci if ((long)x < 0) 3378c2ecf20Sopenharmony_ci x = 0; 3388c2ecf20Sopenharmony_ci 3398c2ecf20Sopenharmony_ci /* 3408c2ecf20Sopenharmony_ci * Make sure that the number of highmem pages is never larger 3418c2ecf20Sopenharmony_ci * than the number of the total dirtyable memory. This can only 3428c2ecf20Sopenharmony_ci * occur in very strange VM situations but we want to make sure 3438c2ecf20Sopenharmony_ci * that this does not occur. 3448c2ecf20Sopenharmony_ci */ 3458c2ecf20Sopenharmony_ci return min(x, total); 3468c2ecf20Sopenharmony_ci#else 3478c2ecf20Sopenharmony_ci return 0; 3488c2ecf20Sopenharmony_ci#endif 3498c2ecf20Sopenharmony_ci} 3508c2ecf20Sopenharmony_ci 3518c2ecf20Sopenharmony_ci/** 3528c2ecf20Sopenharmony_ci * global_dirtyable_memory - number of globally dirtyable pages 3538c2ecf20Sopenharmony_ci * 3548c2ecf20Sopenharmony_ci * Return: the global number of pages potentially available for dirty 3558c2ecf20Sopenharmony_ci * page cache. This is the base value for the global dirty limits. 3568c2ecf20Sopenharmony_ci */ 3578c2ecf20Sopenharmony_cistatic unsigned long global_dirtyable_memory(void) 3588c2ecf20Sopenharmony_ci{ 3598c2ecf20Sopenharmony_ci unsigned long x; 3608c2ecf20Sopenharmony_ci 3618c2ecf20Sopenharmony_ci x = global_zone_page_state(NR_FREE_PAGES); 3628c2ecf20Sopenharmony_ci /* 3638c2ecf20Sopenharmony_ci * Pages reserved for the kernel should not be considered 3648c2ecf20Sopenharmony_ci * dirtyable, to prevent a situation where reclaim has to 3658c2ecf20Sopenharmony_ci * clean pages in order to balance the zones. 
3668c2ecf20Sopenharmony_ci */ 3678c2ecf20Sopenharmony_ci x -= min(x, totalreserve_pages); 3688c2ecf20Sopenharmony_ci 3698c2ecf20Sopenharmony_ci x += global_node_page_state(NR_INACTIVE_FILE); 3708c2ecf20Sopenharmony_ci x += global_node_page_state(NR_ACTIVE_FILE); 3718c2ecf20Sopenharmony_ci 3728c2ecf20Sopenharmony_ci if (!vm_highmem_is_dirtyable) 3738c2ecf20Sopenharmony_ci x -= highmem_dirtyable_memory(x); 3748c2ecf20Sopenharmony_ci 3758c2ecf20Sopenharmony_ci return x + 1; /* Ensure that we never return 0 */ 3768c2ecf20Sopenharmony_ci} 3778c2ecf20Sopenharmony_ci 3788c2ecf20Sopenharmony_ci/** 3798c2ecf20Sopenharmony_ci * domain_dirty_limits - calculate thresh and bg_thresh for a wb_domain 3808c2ecf20Sopenharmony_ci * @dtc: dirty_throttle_control of interest 3818c2ecf20Sopenharmony_ci * 3828c2ecf20Sopenharmony_ci * Calculate @dtc->thresh and ->bg_thresh considering 3838c2ecf20Sopenharmony_ci * vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}. The caller 3848c2ecf20Sopenharmony_ci * must ensure that @dtc->avail is set before calling this function. The 3858c2ecf20Sopenharmony_ci * dirty limits will be lifted by 1/4 for real-time tasks. 
3868c2ecf20Sopenharmony_ci */ 3878c2ecf20Sopenharmony_cistatic void domain_dirty_limits(struct dirty_throttle_control *dtc) 3888c2ecf20Sopenharmony_ci{ 3898c2ecf20Sopenharmony_ci const unsigned long available_memory = dtc->avail; 3908c2ecf20Sopenharmony_ci struct dirty_throttle_control *gdtc = mdtc_gdtc(dtc); 3918c2ecf20Sopenharmony_ci unsigned long bytes = vm_dirty_bytes; 3928c2ecf20Sopenharmony_ci unsigned long bg_bytes = dirty_background_bytes; 3938c2ecf20Sopenharmony_ci /* convert ratios to per-PAGE_SIZE for higher precision */ 3948c2ecf20Sopenharmony_ci unsigned long ratio = (vm_dirty_ratio * PAGE_SIZE) / 100; 3958c2ecf20Sopenharmony_ci unsigned long bg_ratio = (dirty_background_ratio * PAGE_SIZE) / 100; 3968c2ecf20Sopenharmony_ci unsigned long thresh; 3978c2ecf20Sopenharmony_ci unsigned long bg_thresh; 3988c2ecf20Sopenharmony_ci struct task_struct *tsk; 3998c2ecf20Sopenharmony_ci 4008c2ecf20Sopenharmony_ci /* gdtc is !NULL iff @dtc is for memcg domain */ 4018c2ecf20Sopenharmony_ci if (gdtc) { 4028c2ecf20Sopenharmony_ci unsigned long global_avail = gdtc->avail; 4038c2ecf20Sopenharmony_ci 4048c2ecf20Sopenharmony_ci /* 4058c2ecf20Sopenharmony_ci * The byte settings can't be applied directly to memcg 4068c2ecf20Sopenharmony_ci * domains. Convert them to ratios by scaling against 4078c2ecf20Sopenharmony_ci * globally available memory. As the ratios are in 4088c2ecf20Sopenharmony_ci * per-PAGE_SIZE, they can be obtained by dividing bytes by 4098c2ecf20Sopenharmony_ci * number of pages. 
4108c2ecf20Sopenharmony_ci */ 4118c2ecf20Sopenharmony_ci if (bytes) 4128c2ecf20Sopenharmony_ci ratio = min(DIV_ROUND_UP(bytes, global_avail), 4138c2ecf20Sopenharmony_ci PAGE_SIZE); 4148c2ecf20Sopenharmony_ci if (bg_bytes) 4158c2ecf20Sopenharmony_ci bg_ratio = min(DIV_ROUND_UP(bg_bytes, global_avail), 4168c2ecf20Sopenharmony_ci PAGE_SIZE); 4178c2ecf20Sopenharmony_ci bytes = bg_bytes = 0; 4188c2ecf20Sopenharmony_ci } 4198c2ecf20Sopenharmony_ci 4208c2ecf20Sopenharmony_ci if (bytes) 4218c2ecf20Sopenharmony_ci thresh = DIV_ROUND_UP(bytes, PAGE_SIZE); 4228c2ecf20Sopenharmony_ci else 4238c2ecf20Sopenharmony_ci thresh = (ratio * available_memory) / PAGE_SIZE; 4248c2ecf20Sopenharmony_ci 4258c2ecf20Sopenharmony_ci if (bg_bytes) 4268c2ecf20Sopenharmony_ci bg_thresh = DIV_ROUND_UP(bg_bytes, PAGE_SIZE); 4278c2ecf20Sopenharmony_ci else 4288c2ecf20Sopenharmony_ci bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE; 4298c2ecf20Sopenharmony_ci 4308c2ecf20Sopenharmony_ci tsk = current; 4318c2ecf20Sopenharmony_ci if (rt_task(tsk)) { 4328c2ecf20Sopenharmony_ci bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32; 4338c2ecf20Sopenharmony_ci thresh += thresh / 4 + global_wb_domain.dirty_limit / 32; 4348c2ecf20Sopenharmony_ci } 4358c2ecf20Sopenharmony_ci /* 4368c2ecf20Sopenharmony_ci * Dirty throttling logic assumes the limits in page units fit into 4378c2ecf20Sopenharmony_ci * 32-bits. This gives 16TB dirty limits max which is hopefully enough. 
4388c2ecf20Sopenharmony_ci */ 4398c2ecf20Sopenharmony_ci if (thresh > UINT_MAX) 4408c2ecf20Sopenharmony_ci thresh = UINT_MAX; 4418c2ecf20Sopenharmony_ci /* This makes sure bg_thresh is within 32-bits as well */ 4428c2ecf20Sopenharmony_ci if (bg_thresh >= thresh) 4438c2ecf20Sopenharmony_ci bg_thresh = thresh / 2; 4448c2ecf20Sopenharmony_ci dtc->thresh = thresh; 4458c2ecf20Sopenharmony_ci dtc->bg_thresh = bg_thresh; 4468c2ecf20Sopenharmony_ci 4478c2ecf20Sopenharmony_ci /* we should eventually report the domain in the TP */ 4488c2ecf20Sopenharmony_ci if (!gdtc) 4498c2ecf20Sopenharmony_ci trace_global_dirty_state(bg_thresh, thresh); 4508c2ecf20Sopenharmony_ci} 4518c2ecf20Sopenharmony_ci 4528c2ecf20Sopenharmony_ci/** 4538c2ecf20Sopenharmony_ci * global_dirty_limits - background-writeback and dirty-throttling thresholds 4548c2ecf20Sopenharmony_ci * @pbackground: out parameter for bg_thresh 4558c2ecf20Sopenharmony_ci * @pdirty: out parameter for thresh 4568c2ecf20Sopenharmony_ci * 4578c2ecf20Sopenharmony_ci * Calculate bg_thresh and thresh for global_wb_domain. See 4588c2ecf20Sopenharmony_ci * domain_dirty_limits() for details. 
4598c2ecf20Sopenharmony_ci */ 4608c2ecf20Sopenharmony_civoid global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) 4618c2ecf20Sopenharmony_ci{ 4628c2ecf20Sopenharmony_ci struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB }; 4638c2ecf20Sopenharmony_ci 4648c2ecf20Sopenharmony_ci gdtc.avail = global_dirtyable_memory(); 4658c2ecf20Sopenharmony_ci domain_dirty_limits(&gdtc); 4668c2ecf20Sopenharmony_ci 4678c2ecf20Sopenharmony_ci *pbackground = gdtc.bg_thresh; 4688c2ecf20Sopenharmony_ci *pdirty = gdtc.thresh; 4698c2ecf20Sopenharmony_ci} 4708c2ecf20Sopenharmony_ci 4718c2ecf20Sopenharmony_ci/** 4728c2ecf20Sopenharmony_ci * node_dirty_limit - maximum number of dirty pages allowed in a node 4738c2ecf20Sopenharmony_ci * @pgdat: the node 4748c2ecf20Sopenharmony_ci * 4758c2ecf20Sopenharmony_ci * Return: the maximum number of dirty pages allowed in a node, based 4768c2ecf20Sopenharmony_ci * on the node's dirtyable memory. 4778c2ecf20Sopenharmony_ci */ 4788c2ecf20Sopenharmony_cistatic unsigned long node_dirty_limit(struct pglist_data *pgdat) 4798c2ecf20Sopenharmony_ci{ 4808c2ecf20Sopenharmony_ci unsigned long node_memory = node_dirtyable_memory(pgdat); 4818c2ecf20Sopenharmony_ci struct task_struct *tsk = current; 4828c2ecf20Sopenharmony_ci unsigned long dirty; 4838c2ecf20Sopenharmony_ci 4848c2ecf20Sopenharmony_ci if (vm_dirty_bytes) 4858c2ecf20Sopenharmony_ci dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) * 4868c2ecf20Sopenharmony_ci node_memory / global_dirtyable_memory(); 4878c2ecf20Sopenharmony_ci else 4888c2ecf20Sopenharmony_ci dirty = vm_dirty_ratio * node_memory / 100; 4898c2ecf20Sopenharmony_ci 4908c2ecf20Sopenharmony_ci if (rt_task(tsk)) 4918c2ecf20Sopenharmony_ci dirty += dirty / 4; 4928c2ecf20Sopenharmony_ci 4938c2ecf20Sopenharmony_ci /* 4948c2ecf20Sopenharmony_ci * Dirty throttling logic assumes the limits in page units fit into 4958c2ecf20Sopenharmony_ci * 32-bits. This gives 16TB dirty limits max which is hopefully enough. 
4968c2ecf20Sopenharmony_ci */ 4978c2ecf20Sopenharmony_ci return min_t(unsigned long, dirty, UINT_MAX); 4988c2ecf20Sopenharmony_ci} 4998c2ecf20Sopenharmony_ci 5008c2ecf20Sopenharmony_ci/** 5018c2ecf20Sopenharmony_ci * node_dirty_ok - tells whether a node is within its dirty limits 5028c2ecf20Sopenharmony_ci * @pgdat: the node to check 5038c2ecf20Sopenharmony_ci * 5048c2ecf20Sopenharmony_ci * Return: %true when the dirty pages in @pgdat are within the node's 5058c2ecf20Sopenharmony_ci * dirty limit, %false if the limit is exceeded. 5068c2ecf20Sopenharmony_ci */ 5078c2ecf20Sopenharmony_cibool node_dirty_ok(struct pglist_data *pgdat) 5088c2ecf20Sopenharmony_ci{ 5098c2ecf20Sopenharmony_ci unsigned long limit = node_dirty_limit(pgdat); 5108c2ecf20Sopenharmony_ci unsigned long nr_pages = 0; 5118c2ecf20Sopenharmony_ci 5128c2ecf20Sopenharmony_ci nr_pages += node_page_state(pgdat, NR_FILE_DIRTY); 5138c2ecf20Sopenharmony_ci nr_pages += node_page_state(pgdat, NR_WRITEBACK); 5148c2ecf20Sopenharmony_ci 5158c2ecf20Sopenharmony_ci return nr_pages <= limit; 5168c2ecf20Sopenharmony_ci} 5178c2ecf20Sopenharmony_ci 5188c2ecf20Sopenharmony_ciint dirty_background_ratio_handler(struct ctl_table *table, int write, 5198c2ecf20Sopenharmony_ci void *buffer, size_t *lenp, loff_t *ppos) 5208c2ecf20Sopenharmony_ci{ 5218c2ecf20Sopenharmony_ci int ret; 5228c2ecf20Sopenharmony_ci 5238c2ecf20Sopenharmony_ci ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 5248c2ecf20Sopenharmony_ci if (ret == 0 && write) 5258c2ecf20Sopenharmony_ci dirty_background_bytes = 0; 5268c2ecf20Sopenharmony_ci return ret; 5278c2ecf20Sopenharmony_ci} 5288c2ecf20Sopenharmony_ci 5298c2ecf20Sopenharmony_ciint dirty_background_bytes_handler(struct ctl_table *table, int write, 5308c2ecf20Sopenharmony_ci void *buffer, size_t *lenp, loff_t *ppos) 5318c2ecf20Sopenharmony_ci{ 5328c2ecf20Sopenharmony_ci int ret; 5338c2ecf20Sopenharmony_ci unsigned long old_bytes = dirty_background_bytes; 5348c2ecf20Sopenharmony_ci 
5358c2ecf20Sopenharmony_ci ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); 5368c2ecf20Sopenharmony_ci if (ret == 0 && write) { 5378c2ecf20Sopenharmony_ci if (DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE) > 5388c2ecf20Sopenharmony_ci UINT_MAX) { 5398c2ecf20Sopenharmony_ci dirty_background_bytes = old_bytes; 5408c2ecf20Sopenharmony_ci return -ERANGE; 5418c2ecf20Sopenharmony_ci } 5428c2ecf20Sopenharmony_ci dirty_background_ratio = 0; 5438c2ecf20Sopenharmony_ci } 5448c2ecf20Sopenharmony_ci return ret; 5458c2ecf20Sopenharmony_ci} 5468c2ecf20Sopenharmony_ci 5478c2ecf20Sopenharmony_ciint dirty_ratio_handler(struct ctl_table *table, int write, void *buffer, 5488c2ecf20Sopenharmony_ci size_t *lenp, loff_t *ppos) 5498c2ecf20Sopenharmony_ci{ 5508c2ecf20Sopenharmony_ci int old_ratio = vm_dirty_ratio; 5518c2ecf20Sopenharmony_ci int ret; 5528c2ecf20Sopenharmony_ci 5538c2ecf20Sopenharmony_ci ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 5548c2ecf20Sopenharmony_ci if (ret == 0 && write && vm_dirty_ratio != old_ratio) { 5558c2ecf20Sopenharmony_ci writeback_set_ratelimit(); 5568c2ecf20Sopenharmony_ci vm_dirty_bytes = 0; 5578c2ecf20Sopenharmony_ci } 5588c2ecf20Sopenharmony_ci return ret; 5598c2ecf20Sopenharmony_ci} 5608c2ecf20Sopenharmony_ci 5618c2ecf20Sopenharmony_ciint dirty_bytes_handler(struct ctl_table *table, int write, 5628c2ecf20Sopenharmony_ci void *buffer, size_t *lenp, loff_t *ppos) 5638c2ecf20Sopenharmony_ci{ 5648c2ecf20Sopenharmony_ci unsigned long old_bytes = vm_dirty_bytes; 5658c2ecf20Sopenharmony_ci int ret; 5668c2ecf20Sopenharmony_ci 5678c2ecf20Sopenharmony_ci ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); 5688c2ecf20Sopenharmony_ci if (ret == 0 && write && vm_dirty_bytes != old_bytes) { 5698c2ecf20Sopenharmony_ci if (DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) > UINT_MAX) { 5708c2ecf20Sopenharmony_ci vm_dirty_bytes = old_bytes; 5718c2ecf20Sopenharmony_ci return -ERANGE; 5728c2ecf20Sopenharmony_ci } 
5738c2ecf20Sopenharmony_ci writeback_set_ratelimit(); 5748c2ecf20Sopenharmony_ci vm_dirty_ratio = 0; 5758c2ecf20Sopenharmony_ci } 5768c2ecf20Sopenharmony_ci return ret; 5778c2ecf20Sopenharmony_ci} 5788c2ecf20Sopenharmony_ci 5798c2ecf20Sopenharmony_cistatic unsigned long wp_next_time(unsigned long cur_time) 5808c2ecf20Sopenharmony_ci{ 5818c2ecf20Sopenharmony_ci cur_time += VM_COMPLETIONS_PERIOD_LEN; 5828c2ecf20Sopenharmony_ci /* 0 has a special meaning... */ 5838c2ecf20Sopenharmony_ci if (!cur_time) 5848c2ecf20Sopenharmony_ci return 1; 5858c2ecf20Sopenharmony_ci return cur_time; 5868c2ecf20Sopenharmony_ci} 5878c2ecf20Sopenharmony_ci 5888c2ecf20Sopenharmony_cistatic void wb_domain_writeout_inc(struct wb_domain *dom, 5898c2ecf20Sopenharmony_ci struct fprop_local_percpu *completions, 5908c2ecf20Sopenharmony_ci unsigned int max_prop_frac) 5918c2ecf20Sopenharmony_ci{ 5928c2ecf20Sopenharmony_ci __fprop_inc_percpu_max(&dom->completions, completions, 5938c2ecf20Sopenharmony_ci max_prop_frac); 5948c2ecf20Sopenharmony_ci /* First event after period switching was turned off? */ 5958c2ecf20Sopenharmony_ci if (unlikely(!dom->period_time)) { 5968c2ecf20Sopenharmony_ci /* 5978c2ecf20Sopenharmony_ci * We can race with other __bdi_writeout_inc calls here but 5988c2ecf20Sopenharmony_ci * it does not cause any harm since the resulting time when 5998c2ecf20Sopenharmony_ci * timer will fire and what is in writeout_period_time will be 6008c2ecf20Sopenharmony_ci * roughly the same. 6018c2ecf20Sopenharmony_ci */ 6028c2ecf20Sopenharmony_ci dom->period_time = wp_next_time(jiffies); 6038c2ecf20Sopenharmony_ci mod_timer(&dom->period_timer, dom->period_time); 6048c2ecf20Sopenharmony_ci } 6058c2ecf20Sopenharmony_ci} 6068c2ecf20Sopenharmony_ci 6078c2ecf20Sopenharmony_ci/* 6088c2ecf20Sopenharmony_ci * Increment @wb's writeout completion count and the global writeout 6098c2ecf20Sopenharmony_ci * completion count. Called from test_clear_page_writeback(). 
6108c2ecf20Sopenharmony_ci */ 6118c2ecf20Sopenharmony_cistatic inline void __wb_writeout_inc(struct bdi_writeback *wb) 6128c2ecf20Sopenharmony_ci{ 6138c2ecf20Sopenharmony_ci struct wb_domain *cgdom; 6148c2ecf20Sopenharmony_ci 6158c2ecf20Sopenharmony_ci inc_wb_stat(wb, WB_WRITTEN); 6168c2ecf20Sopenharmony_ci wb_domain_writeout_inc(&global_wb_domain, &wb->completions, 6178c2ecf20Sopenharmony_ci wb->bdi->max_prop_frac); 6188c2ecf20Sopenharmony_ci 6198c2ecf20Sopenharmony_ci cgdom = mem_cgroup_wb_domain(wb); 6208c2ecf20Sopenharmony_ci if (cgdom) 6218c2ecf20Sopenharmony_ci wb_domain_writeout_inc(cgdom, wb_memcg_completions(wb), 6228c2ecf20Sopenharmony_ci wb->bdi->max_prop_frac); 6238c2ecf20Sopenharmony_ci} 6248c2ecf20Sopenharmony_ci 6258c2ecf20Sopenharmony_civoid wb_writeout_inc(struct bdi_writeback *wb) 6268c2ecf20Sopenharmony_ci{ 6278c2ecf20Sopenharmony_ci unsigned long flags; 6288c2ecf20Sopenharmony_ci 6298c2ecf20Sopenharmony_ci local_irq_save(flags); 6308c2ecf20Sopenharmony_ci __wb_writeout_inc(wb); 6318c2ecf20Sopenharmony_ci local_irq_restore(flags); 6328c2ecf20Sopenharmony_ci} 6338c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(wb_writeout_inc); 6348c2ecf20Sopenharmony_ci 6358c2ecf20Sopenharmony_ci/* 6368c2ecf20Sopenharmony_ci * On idle system, we can be called long after we scheduled because we use 6378c2ecf20Sopenharmony_ci * deferred timers so count with missed periods. 
6388c2ecf20Sopenharmony_ci */ 6398c2ecf20Sopenharmony_cistatic void writeout_period(struct timer_list *t) 6408c2ecf20Sopenharmony_ci{ 6418c2ecf20Sopenharmony_ci struct wb_domain *dom = from_timer(dom, t, period_timer); 6428c2ecf20Sopenharmony_ci int miss_periods = (jiffies - dom->period_time) / 6438c2ecf20Sopenharmony_ci VM_COMPLETIONS_PERIOD_LEN; 6448c2ecf20Sopenharmony_ci 6458c2ecf20Sopenharmony_ci if (fprop_new_period(&dom->completions, miss_periods + 1)) { 6468c2ecf20Sopenharmony_ci dom->period_time = wp_next_time(dom->period_time + 6478c2ecf20Sopenharmony_ci miss_periods * VM_COMPLETIONS_PERIOD_LEN); 6488c2ecf20Sopenharmony_ci mod_timer(&dom->period_timer, dom->period_time); 6498c2ecf20Sopenharmony_ci } else { 6508c2ecf20Sopenharmony_ci /* 6518c2ecf20Sopenharmony_ci * Aging has zeroed all fractions. Stop wasting CPU on period 6528c2ecf20Sopenharmony_ci * updates. 6538c2ecf20Sopenharmony_ci */ 6548c2ecf20Sopenharmony_ci dom->period_time = 0; 6558c2ecf20Sopenharmony_ci } 6568c2ecf20Sopenharmony_ci} 6578c2ecf20Sopenharmony_ci 6588c2ecf20Sopenharmony_ciint wb_domain_init(struct wb_domain *dom, gfp_t gfp) 6598c2ecf20Sopenharmony_ci{ 6608c2ecf20Sopenharmony_ci memset(dom, 0, sizeof(*dom)); 6618c2ecf20Sopenharmony_ci 6628c2ecf20Sopenharmony_ci spin_lock_init(&dom->lock); 6638c2ecf20Sopenharmony_ci 6648c2ecf20Sopenharmony_ci timer_setup(&dom->period_timer, writeout_period, TIMER_DEFERRABLE); 6658c2ecf20Sopenharmony_ci 6668c2ecf20Sopenharmony_ci dom->dirty_limit_tstamp = jiffies; 6678c2ecf20Sopenharmony_ci 6688c2ecf20Sopenharmony_ci return fprop_global_init(&dom->completions, gfp); 6698c2ecf20Sopenharmony_ci} 6708c2ecf20Sopenharmony_ci 6718c2ecf20Sopenharmony_ci#ifdef CONFIG_CGROUP_WRITEBACK 6728c2ecf20Sopenharmony_civoid wb_domain_exit(struct wb_domain *dom) 6738c2ecf20Sopenharmony_ci{ 6748c2ecf20Sopenharmony_ci del_timer_sync(&dom->period_timer); 6758c2ecf20Sopenharmony_ci fprop_global_destroy(&dom->completions); 6768c2ecf20Sopenharmony_ci} 
6778c2ecf20Sopenharmony_ci#endif 6788c2ecf20Sopenharmony_ci 6798c2ecf20Sopenharmony_ci/* 6808c2ecf20Sopenharmony_ci * bdi_min_ratio keeps the sum of the minimum dirty shares of all 6818c2ecf20Sopenharmony_ci * registered backing devices, which, for obvious reasons, can not 6828c2ecf20Sopenharmony_ci * exceed 100%. 6838c2ecf20Sopenharmony_ci */ 6848c2ecf20Sopenharmony_cistatic unsigned int bdi_min_ratio; 6858c2ecf20Sopenharmony_ci 6868c2ecf20Sopenharmony_ciint bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) 6878c2ecf20Sopenharmony_ci{ 6888c2ecf20Sopenharmony_ci int ret = 0; 6898c2ecf20Sopenharmony_ci 6908c2ecf20Sopenharmony_ci spin_lock_bh(&bdi_lock); 6918c2ecf20Sopenharmony_ci if (min_ratio > bdi->max_ratio) { 6928c2ecf20Sopenharmony_ci ret = -EINVAL; 6938c2ecf20Sopenharmony_ci } else { 6948c2ecf20Sopenharmony_ci min_ratio -= bdi->min_ratio; 6958c2ecf20Sopenharmony_ci if (bdi_min_ratio + min_ratio < 100) { 6968c2ecf20Sopenharmony_ci bdi_min_ratio += min_ratio; 6978c2ecf20Sopenharmony_ci bdi->min_ratio += min_ratio; 6988c2ecf20Sopenharmony_ci } else { 6998c2ecf20Sopenharmony_ci ret = -EINVAL; 7008c2ecf20Sopenharmony_ci } 7018c2ecf20Sopenharmony_ci } 7028c2ecf20Sopenharmony_ci spin_unlock_bh(&bdi_lock); 7038c2ecf20Sopenharmony_ci 7048c2ecf20Sopenharmony_ci return ret; 7058c2ecf20Sopenharmony_ci} 7068c2ecf20Sopenharmony_ci 7078c2ecf20Sopenharmony_ciint bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) 7088c2ecf20Sopenharmony_ci{ 7098c2ecf20Sopenharmony_ci int ret = 0; 7108c2ecf20Sopenharmony_ci 7118c2ecf20Sopenharmony_ci if (max_ratio > 100) 7128c2ecf20Sopenharmony_ci return -EINVAL; 7138c2ecf20Sopenharmony_ci 7148c2ecf20Sopenharmony_ci spin_lock_bh(&bdi_lock); 7158c2ecf20Sopenharmony_ci if (bdi->min_ratio > max_ratio) { 7168c2ecf20Sopenharmony_ci ret = -EINVAL; 7178c2ecf20Sopenharmony_ci } else { 7188c2ecf20Sopenharmony_ci bdi->max_ratio = max_ratio; 7198c2ecf20Sopenharmony_ci bdi->max_prop_frac = (FPROP_FRAC_BASE * 
max_ratio) / 100; 7208c2ecf20Sopenharmony_ci } 7218c2ecf20Sopenharmony_ci spin_unlock_bh(&bdi_lock); 7228c2ecf20Sopenharmony_ci 7238c2ecf20Sopenharmony_ci return ret; 7248c2ecf20Sopenharmony_ci} 7258c2ecf20Sopenharmony_ciEXPORT_SYMBOL(bdi_set_max_ratio); 7268c2ecf20Sopenharmony_ci 7278c2ecf20Sopenharmony_cistatic unsigned long dirty_freerun_ceiling(unsigned long thresh, 7288c2ecf20Sopenharmony_ci unsigned long bg_thresh) 7298c2ecf20Sopenharmony_ci{ 7308c2ecf20Sopenharmony_ci return (thresh + bg_thresh) / 2; 7318c2ecf20Sopenharmony_ci} 7328c2ecf20Sopenharmony_ci 7338c2ecf20Sopenharmony_cistatic unsigned long hard_dirty_limit(struct wb_domain *dom, 7348c2ecf20Sopenharmony_ci unsigned long thresh) 7358c2ecf20Sopenharmony_ci{ 7368c2ecf20Sopenharmony_ci return max(thresh, dom->dirty_limit); 7378c2ecf20Sopenharmony_ci} 7388c2ecf20Sopenharmony_ci 7398c2ecf20Sopenharmony_ci/* 7408c2ecf20Sopenharmony_ci * Memory which can be further allocated to a memcg domain is capped by 7418c2ecf20Sopenharmony_ci * system-wide clean memory excluding the amount being used in the domain. 
7428c2ecf20Sopenharmony_ci */ 7438c2ecf20Sopenharmony_cistatic void mdtc_calc_avail(struct dirty_throttle_control *mdtc, 7448c2ecf20Sopenharmony_ci unsigned long filepages, unsigned long headroom) 7458c2ecf20Sopenharmony_ci{ 7468c2ecf20Sopenharmony_ci struct dirty_throttle_control *gdtc = mdtc_gdtc(mdtc); 7478c2ecf20Sopenharmony_ci unsigned long clean = filepages - min(filepages, mdtc->dirty); 7488c2ecf20Sopenharmony_ci unsigned long global_clean = gdtc->avail - min(gdtc->avail, gdtc->dirty); 7498c2ecf20Sopenharmony_ci unsigned long other_clean = global_clean - min(global_clean, clean); 7508c2ecf20Sopenharmony_ci 7518c2ecf20Sopenharmony_ci mdtc->avail = filepages + min(headroom, other_clean); 7528c2ecf20Sopenharmony_ci} 7538c2ecf20Sopenharmony_ci 7548c2ecf20Sopenharmony_ci/** 7558c2ecf20Sopenharmony_ci * __wb_calc_thresh - @wb's share of dirty throttling threshold 7568c2ecf20Sopenharmony_ci * @dtc: dirty_throttle_context of interest 7578c2ecf20Sopenharmony_ci * 7588c2ecf20Sopenharmony_ci * Note that balance_dirty_pages() will only seriously take it as a hard limit 7598c2ecf20Sopenharmony_ci * when sleeping max_pause per page is not enough to keep the dirty pages under 7608c2ecf20Sopenharmony_ci * control. For example, when the device is completely stalled due to some error 7618c2ecf20Sopenharmony_ci * conditions, or when there are 1000 dd tasks writing to a slow 10MB/s USB key. 7628c2ecf20Sopenharmony_ci * In the other normal situations, it acts more gently by throttling the tasks 7638c2ecf20Sopenharmony_ci * more (rather than completely block them) when the wb dirty pages go high. 
7648c2ecf20Sopenharmony_ci * 7658c2ecf20Sopenharmony_ci * It allocates high/low dirty limits to fast/slow devices, in order to prevent 7668c2ecf20Sopenharmony_ci * - starving fast devices 7678c2ecf20Sopenharmony_ci * - piling up dirty pages (that will take long time to sync) on slow devices 7688c2ecf20Sopenharmony_ci * 7698c2ecf20Sopenharmony_ci * The wb's share of dirty limit will be adapting to its throughput and 7708c2ecf20Sopenharmony_ci * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set. 7718c2ecf20Sopenharmony_ci * 7728c2ecf20Sopenharmony_ci * Return: @wb's dirty limit in pages. The term "dirty" in the context of 7738c2ecf20Sopenharmony_ci * dirty balancing includes all PG_dirty and PG_writeback pages. 7748c2ecf20Sopenharmony_ci */ 7758c2ecf20Sopenharmony_cistatic unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc) 7768c2ecf20Sopenharmony_ci{ 7778c2ecf20Sopenharmony_ci struct wb_domain *dom = dtc_dom(dtc); 7788c2ecf20Sopenharmony_ci unsigned long thresh = dtc->thresh; 7798c2ecf20Sopenharmony_ci u64 wb_thresh; 7808c2ecf20Sopenharmony_ci unsigned long numerator, denominator; 7818c2ecf20Sopenharmony_ci unsigned long wb_min_ratio, wb_max_ratio; 7828c2ecf20Sopenharmony_ci 7838c2ecf20Sopenharmony_ci /* 7848c2ecf20Sopenharmony_ci * Calculate this BDI's share of the thresh ratio. 
7858c2ecf20Sopenharmony_ci */ 7868c2ecf20Sopenharmony_ci fprop_fraction_percpu(&dom->completions, dtc->wb_completions, 7878c2ecf20Sopenharmony_ci &numerator, &denominator); 7888c2ecf20Sopenharmony_ci 7898c2ecf20Sopenharmony_ci wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100; 7908c2ecf20Sopenharmony_ci wb_thresh *= numerator; 7918c2ecf20Sopenharmony_ci wb_thresh = div64_ul(wb_thresh, denominator); 7928c2ecf20Sopenharmony_ci 7938c2ecf20Sopenharmony_ci wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio); 7948c2ecf20Sopenharmony_ci 7958c2ecf20Sopenharmony_ci wb_thresh += (thresh * wb_min_ratio) / 100; 7968c2ecf20Sopenharmony_ci if (wb_thresh > (thresh * wb_max_ratio) / 100) 7978c2ecf20Sopenharmony_ci wb_thresh = thresh * wb_max_ratio / 100; 7988c2ecf20Sopenharmony_ci 7998c2ecf20Sopenharmony_ci return wb_thresh; 8008c2ecf20Sopenharmony_ci} 8018c2ecf20Sopenharmony_ci 8028c2ecf20Sopenharmony_ciunsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh) 8038c2ecf20Sopenharmony_ci{ 8048c2ecf20Sopenharmony_ci struct dirty_throttle_control gdtc = { GDTC_INIT(wb), 8058c2ecf20Sopenharmony_ci .thresh = thresh }; 8068c2ecf20Sopenharmony_ci return __wb_calc_thresh(&gdtc); 8078c2ecf20Sopenharmony_ci} 8088c2ecf20Sopenharmony_ci 8098c2ecf20Sopenharmony_ci/* 8108c2ecf20Sopenharmony_ci * setpoint - dirty 3 8118c2ecf20Sopenharmony_ci * f(dirty) := 1.0 + (----------------) 8128c2ecf20Sopenharmony_ci * limit - setpoint 8138c2ecf20Sopenharmony_ci * 8148c2ecf20Sopenharmony_ci * it's a 3rd order polynomial that subjects to 8158c2ecf20Sopenharmony_ci * 8168c2ecf20Sopenharmony_ci * (1) f(freerun) = 2.0 => rampup dirty_ratelimit reasonably fast 8178c2ecf20Sopenharmony_ci * (2) f(setpoint) = 1.0 => the balance point 8188c2ecf20Sopenharmony_ci * (3) f(limit) = 0 => the hard limit 8198c2ecf20Sopenharmony_ci * (4) df/dx <= 0 => negative feedback control 8208c2ecf20Sopenharmony_ci * (5) the closer to setpoint, the smaller |df/dx| (and the reverse) 8218c2ecf20Sopenharmony_ci * 
=> fast response on large errors; small oscillation near setpoint 8228c2ecf20Sopenharmony_ci */ 8238c2ecf20Sopenharmony_cistatic long long pos_ratio_polynom(unsigned long setpoint, 8248c2ecf20Sopenharmony_ci unsigned long dirty, 8258c2ecf20Sopenharmony_ci unsigned long limit) 8268c2ecf20Sopenharmony_ci{ 8278c2ecf20Sopenharmony_ci long long pos_ratio; 8288c2ecf20Sopenharmony_ci long x; 8298c2ecf20Sopenharmony_ci 8308c2ecf20Sopenharmony_ci x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT, 8318c2ecf20Sopenharmony_ci (limit - setpoint) | 1); 8328c2ecf20Sopenharmony_ci pos_ratio = x; 8338c2ecf20Sopenharmony_ci pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; 8348c2ecf20Sopenharmony_ci pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; 8358c2ecf20Sopenharmony_ci pos_ratio += 1 << RATELIMIT_CALC_SHIFT; 8368c2ecf20Sopenharmony_ci 8378c2ecf20Sopenharmony_ci return clamp(pos_ratio, 0LL, 2LL << RATELIMIT_CALC_SHIFT); 8388c2ecf20Sopenharmony_ci} 8398c2ecf20Sopenharmony_ci 8408c2ecf20Sopenharmony_ci/* 8418c2ecf20Sopenharmony_ci * Dirty position control. 8428c2ecf20Sopenharmony_ci * 8438c2ecf20Sopenharmony_ci * (o) global/bdi setpoints 8448c2ecf20Sopenharmony_ci * 8458c2ecf20Sopenharmony_ci * We want the dirty pages be balanced around the global/wb setpoints. 8468c2ecf20Sopenharmony_ci * When the number of dirty pages is higher/lower than the setpoint, the 8478c2ecf20Sopenharmony_ci * dirty position control ratio (and hence task dirty ratelimit) will be 8488c2ecf20Sopenharmony_ci * decreased/increased to bring the dirty pages back to the setpoint. 
8498c2ecf20Sopenharmony_ci * 8508c2ecf20Sopenharmony_ci * pos_ratio = 1 << RATELIMIT_CALC_SHIFT 8518c2ecf20Sopenharmony_ci * 8528c2ecf20Sopenharmony_ci * if (dirty < setpoint) scale up pos_ratio 8538c2ecf20Sopenharmony_ci * if (dirty > setpoint) scale down pos_ratio 8548c2ecf20Sopenharmony_ci * 8558c2ecf20Sopenharmony_ci * if (wb_dirty < wb_setpoint) scale up pos_ratio 8568c2ecf20Sopenharmony_ci * if (wb_dirty > wb_setpoint) scale down pos_ratio 8578c2ecf20Sopenharmony_ci * 8588c2ecf20Sopenharmony_ci * task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT 8598c2ecf20Sopenharmony_ci * 8608c2ecf20Sopenharmony_ci * (o) global control line 8618c2ecf20Sopenharmony_ci * 8628c2ecf20Sopenharmony_ci * ^ pos_ratio 8638c2ecf20Sopenharmony_ci * | 8648c2ecf20Sopenharmony_ci * | |<===== global dirty control scope ======>| 8658c2ecf20Sopenharmony_ci * 2.0 .............* 8668c2ecf20Sopenharmony_ci * | .* 8678c2ecf20Sopenharmony_ci * | . * 8688c2ecf20Sopenharmony_ci * | . * 8698c2ecf20Sopenharmony_ci * | . * 8708c2ecf20Sopenharmony_ci * | . * 8718c2ecf20Sopenharmony_ci * | . * 8728c2ecf20Sopenharmony_ci * 1.0 ................................* 8738c2ecf20Sopenharmony_ci * | . . * 8748c2ecf20Sopenharmony_ci * | . . * 8758c2ecf20Sopenharmony_ci * | . . * 8768c2ecf20Sopenharmony_ci * | . . * 8778c2ecf20Sopenharmony_ci * | . . * 8788c2ecf20Sopenharmony_ci * 0 +------------.------------------.----------------------*-------------> 8798c2ecf20Sopenharmony_ci * freerun^ setpoint^ limit^ dirty pages 8808c2ecf20Sopenharmony_ci * 8818c2ecf20Sopenharmony_ci * (o) wb control line 8828c2ecf20Sopenharmony_ci * 8838c2ecf20Sopenharmony_ci * ^ pos_ratio 8848c2ecf20Sopenharmony_ci * | 8858c2ecf20Sopenharmony_ci * | * 8868c2ecf20Sopenharmony_ci * | * 8878c2ecf20Sopenharmony_ci * | * 8888c2ecf20Sopenharmony_ci * | * 8898c2ecf20Sopenharmony_ci * | * |<=========== span ============>| 8908c2ecf20Sopenharmony_ci * 1.0 .......................* 8918c2ecf20Sopenharmony_ci * | . 
* 8928c2ecf20Sopenharmony_ci * | . * 8938c2ecf20Sopenharmony_ci * | . * 8948c2ecf20Sopenharmony_ci * | . * 8958c2ecf20Sopenharmony_ci * | . * 8968c2ecf20Sopenharmony_ci * | . * 8978c2ecf20Sopenharmony_ci * | . * 8988c2ecf20Sopenharmony_ci * | . * 8998c2ecf20Sopenharmony_ci * | . * 9008c2ecf20Sopenharmony_ci * | . * 9018c2ecf20Sopenharmony_ci * | . * 9028c2ecf20Sopenharmony_ci * 1/4 ...............................................* * * * * * * * * * * * 9038c2ecf20Sopenharmony_ci * | . . 9048c2ecf20Sopenharmony_ci * | . . 9058c2ecf20Sopenharmony_ci * | . . 9068c2ecf20Sopenharmony_ci * 0 +----------------------.-------------------------------.-------------> 9078c2ecf20Sopenharmony_ci * wb_setpoint^ x_intercept^ 9088c2ecf20Sopenharmony_ci * 9098c2ecf20Sopenharmony_ci * The wb control line won't drop below pos_ratio=1/4, so that wb_dirty can 9108c2ecf20Sopenharmony_ci * be smoothly throttled down to normal if it starts high in situations like 9118c2ecf20Sopenharmony_ci * - start writing to a slow SD card and a fast disk at the same time. The SD 9128c2ecf20Sopenharmony_ci * card's wb_dirty may rush to many times higher than wb_setpoint. 
9138c2ecf20Sopenharmony_ci * - the wb dirty thresh drops quickly due to change of JBOD workload 9148c2ecf20Sopenharmony_ci */ 9158c2ecf20Sopenharmony_cistatic void wb_position_ratio(struct dirty_throttle_control *dtc) 9168c2ecf20Sopenharmony_ci{ 9178c2ecf20Sopenharmony_ci struct bdi_writeback *wb = dtc->wb; 9188c2ecf20Sopenharmony_ci unsigned long write_bw = wb->avg_write_bandwidth; 9198c2ecf20Sopenharmony_ci unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh); 9208c2ecf20Sopenharmony_ci unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh); 9218c2ecf20Sopenharmony_ci unsigned long wb_thresh = dtc->wb_thresh; 9228c2ecf20Sopenharmony_ci unsigned long x_intercept; 9238c2ecf20Sopenharmony_ci unsigned long setpoint; /* dirty pages' target balance point */ 9248c2ecf20Sopenharmony_ci unsigned long wb_setpoint; 9258c2ecf20Sopenharmony_ci unsigned long span; 9268c2ecf20Sopenharmony_ci long long pos_ratio; /* for scaling up/down the rate limit */ 9278c2ecf20Sopenharmony_ci long x; 9288c2ecf20Sopenharmony_ci 9298c2ecf20Sopenharmony_ci dtc->pos_ratio = 0; 9308c2ecf20Sopenharmony_ci 9318c2ecf20Sopenharmony_ci if (unlikely(dtc->dirty >= limit)) 9328c2ecf20Sopenharmony_ci return; 9338c2ecf20Sopenharmony_ci 9348c2ecf20Sopenharmony_ci /* 9358c2ecf20Sopenharmony_ci * global setpoint 9368c2ecf20Sopenharmony_ci * 9378c2ecf20Sopenharmony_ci * See comment for pos_ratio_polynom(). 9388c2ecf20Sopenharmony_ci */ 9398c2ecf20Sopenharmony_ci setpoint = (freerun + limit) / 2; 9408c2ecf20Sopenharmony_ci pos_ratio = pos_ratio_polynom(setpoint, dtc->dirty, limit); 9418c2ecf20Sopenharmony_ci 9428c2ecf20Sopenharmony_ci /* 9438c2ecf20Sopenharmony_ci * The strictlimit feature is a tool preventing mistrusted filesystems 9448c2ecf20Sopenharmony_ci * from growing a large number of dirty pages before throttling. For 9458c2ecf20Sopenharmony_ci * such filesystems balance_dirty_pages always checks wb counters 9468c2ecf20Sopenharmony_ci * against wb limits. 
Even if global "nr_dirty" is under "freerun". 9478c2ecf20Sopenharmony_ci * This is especially important for fuse which sets bdi->max_ratio to 9488c2ecf20Sopenharmony_ci * 1% by default. Without strictlimit feature, fuse writeback may 9498c2ecf20Sopenharmony_ci * consume arbitrary amount of RAM because it is accounted in 9508c2ecf20Sopenharmony_ci * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty". 9518c2ecf20Sopenharmony_ci * 9528c2ecf20Sopenharmony_ci * Here, in wb_position_ratio(), we calculate pos_ratio based on 9538c2ecf20Sopenharmony_ci * two values: wb_dirty and wb_thresh. Let's consider an example: 9548c2ecf20Sopenharmony_ci * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global 9558c2ecf20Sopenharmony_ci * limits are set by default to 10% and 20% (background and throttle). 9568c2ecf20Sopenharmony_ci * Then wb_thresh is 1% of 20% of 16GB. This amounts to ~8K pages. 9578c2ecf20Sopenharmony_ci * wb_calc_thresh(wb, bg_thresh) is about ~4K pages. wb_setpoint is 9588c2ecf20Sopenharmony_ci * about ~6K pages (as the average of background and throttle wb 9598c2ecf20Sopenharmony_ci * limits). The 3rd order polynomial will provide positive feedback if 9608c2ecf20Sopenharmony_ci * wb_dirty is under wb_setpoint and vice versa. 9618c2ecf20Sopenharmony_ci * 9628c2ecf20Sopenharmony_ci * Note, that we cannot use global counters in these calculations 9638c2ecf20Sopenharmony_ci * because we want to throttle process writing to a strictlimit wb 9648c2ecf20Sopenharmony_ci * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB 9658c2ecf20Sopenharmony_ci * in the example above). 
9668c2ecf20Sopenharmony_ci */ 9678c2ecf20Sopenharmony_ci if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { 9688c2ecf20Sopenharmony_ci long long wb_pos_ratio; 9698c2ecf20Sopenharmony_ci 9708c2ecf20Sopenharmony_ci if (dtc->wb_dirty < 8) { 9718c2ecf20Sopenharmony_ci dtc->pos_ratio = min_t(long long, pos_ratio * 2, 9728c2ecf20Sopenharmony_ci 2 << RATELIMIT_CALC_SHIFT); 9738c2ecf20Sopenharmony_ci return; 9748c2ecf20Sopenharmony_ci } 9758c2ecf20Sopenharmony_ci 9768c2ecf20Sopenharmony_ci if (dtc->wb_dirty >= wb_thresh) 9778c2ecf20Sopenharmony_ci return; 9788c2ecf20Sopenharmony_ci 9798c2ecf20Sopenharmony_ci wb_setpoint = dirty_freerun_ceiling(wb_thresh, 9808c2ecf20Sopenharmony_ci dtc->wb_bg_thresh); 9818c2ecf20Sopenharmony_ci 9828c2ecf20Sopenharmony_ci if (wb_setpoint == 0 || wb_setpoint == wb_thresh) 9838c2ecf20Sopenharmony_ci return; 9848c2ecf20Sopenharmony_ci 9858c2ecf20Sopenharmony_ci wb_pos_ratio = pos_ratio_polynom(wb_setpoint, dtc->wb_dirty, 9868c2ecf20Sopenharmony_ci wb_thresh); 9878c2ecf20Sopenharmony_ci 9888c2ecf20Sopenharmony_ci /* 9898c2ecf20Sopenharmony_ci * Typically, for strictlimit case, wb_setpoint << setpoint 9908c2ecf20Sopenharmony_ci * and pos_ratio >> wb_pos_ratio. In the other words global 9918c2ecf20Sopenharmony_ci * state ("dirty") is not limiting factor and we have to 9928c2ecf20Sopenharmony_ci * make decision based on wb counters. But there is an 9938c2ecf20Sopenharmony_ci * important case when global pos_ratio should get precedence: 9948c2ecf20Sopenharmony_ci * global limits are exceeded (e.g. due to activities on other 9958c2ecf20Sopenharmony_ci * wb's) while given strictlimit wb is below limit. 9968c2ecf20Sopenharmony_ci * 9978c2ecf20Sopenharmony_ci * "pos_ratio * wb_pos_ratio" would work for the case above, 9988c2ecf20Sopenharmony_ci * but it would look too non-natural for the case of all 9998c2ecf20Sopenharmony_ci * activity in the system coming from a single strictlimit wb 10008c2ecf20Sopenharmony_ci * with bdi->max_ratio == 100%. 
10018c2ecf20Sopenharmony_ci * 10028c2ecf20Sopenharmony_ci * Note that min() below somewhat changes the dynamics of the 10038c2ecf20Sopenharmony_ci * control system. Normally, pos_ratio value can be well over 3 10048c2ecf20Sopenharmony_ci * (when globally we are at freerun and wb is well below wb 10058c2ecf20Sopenharmony_ci * setpoint). Now the maximum pos_ratio in the same situation 10068c2ecf20Sopenharmony_ci * is 2. We might want to tweak this if we observe the control 10078c2ecf20Sopenharmony_ci * system is too slow to adapt. 10088c2ecf20Sopenharmony_ci */ 10098c2ecf20Sopenharmony_ci dtc->pos_ratio = min(pos_ratio, wb_pos_ratio); 10108c2ecf20Sopenharmony_ci return; 10118c2ecf20Sopenharmony_ci } 10128c2ecf20Sopenharmony_ci 10138c2ecf20Sopenharmony_ci /* 10148c2ecf20Sopenharmony_ci * We have computed basic pos_ratio above based on global situation. If 10158c2ecf20Sopenharmony_ci * the wb is over/under its share of dirty pages, we want to scale 10168c2ecf20Sopenharmony_ci * pos_ratio further down/up. That is done by the following mechanism. 
10178c2ecf20Sopenharmony_ci */ 10188c2ecf20Sopenharmony_ci 10198c2ecf20Sopenharmony_ci /* 10208c2ecf20Sopenharmony_ci * wb setpoint 10218c2ecf20Sopenharmony_ci * 10228c2ecf20Sopenharmony_ci * f(wb_dirty) := 1.0 + k * (wb_dirty - wb_setpoint) 10238c2ecf20Sopenharmony_ci * 10248c2ecf20Sopenharmony_ci * x_intercept - wb_dirty 10258c2ecf20Sopenharmony_ci * := -------------------------- 10268c2ecf20Sopenharmony_ci * x_intercept - wb_setpoint 10278c2ecf20Sopenharmony_ci * 10288c2ecf20Sopenharmony_ci * The main wb control line is a linear function that subjects to 10298c2ecf20Sopenharmony_ci * 10308c2ecf20Sopenharmony_ci * (1) f(wb_setpoint) = 1.0 10318c2ecf20Sopenharmony_ci * (2) k = - 1 / (8 * write_bw) (in single wb case) 10328c2ecf20Sopenharmony_ci * or equally: x_intercept = wb_setpoint + 8 * write_bw 10338c2ecf20Sopenharmony_ci * 10348c2ecf20Sopenharmony_ci * For single wb case, the dirty pages are observed to fluctuate 10358c2ecf20Sopenharmony_ci * regularly within range 10368c2ecf20Sopenharmony_ci * [wb_setpoint - write_bw/2, wb_setpoint + write_bw/2] 10378c2ecf20Sopenharmony_ci * for various filesystems, where (2) can yield in a reasonable 12.5% 10388c2ecf20Sopenharmony_ci * fluctuation range for pos_ratio. 10398c2ecf20Sopenharmony_ci * 10408c2ecf20Sopenharmony_ci * For JBOD case, wb_thresh (not wb_dirty!) could fluctuate up to its 10418c2ecf20Sopenharmony_ci * own size, so move the slope over accordingly and choose a slope that 10428c2ecf20Sopenharmony_ci * yields 100% pos_ratio fluctuation on suddenly doubled wb_thresh. 10438c2ecf20Sopenharmony_ci */ 10448c2ecf20Sopenharmony_ci if (unlikely(wb_thresh > dtc->thresh)) 10458c2ecf20Sopenharmony_ci wb_thresh = dtc->thresh; 10468c2ecf20Sopenharmony_ci /* 10478c2ecf20Sopenharmony_ci * It's very possible that wb_thresh is close to 0 not because the 10488c2ecf20Sopenharmony_ci * device is slow, but that it has remained inactive for long time. 
10498c2ecf20Sopenharmony_ci * Honour such devices a reasonable good (hopefully IO efficient) 10508c2ecf20Sopenharmony_ci * threshold, so that the occasional writes won't be blocked and active 10518c2ecf20Sopenharmony_ci * writes can rampup the threshold quickly. 10528c2ecf20Sopenharmony_ci */ 10538c2ecf20Sopenharmony_ci wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8); 10548c2ecf20Sopenharmony_ci /* 10558c2ecf20Sopenharmony_ci * scale global setpoint to wb's: 10568c2ecf20Sopenharmony_ci * wb_setpoint = setpoint * wb_thresh / thresh 10578c2ecf20Sopenharmony_ci */ 10588c2ecf20Sopenharmony_ci x = div_u64((u64)wb_thresh << 16, dtc->thresh | 1); 10598c2ecf20Sopenharmony_ci wb_setpoint = setpoint * (u64)x >> 16; 10608c2ecf20Sopenharmony_ci /* 10618c2ecf20Sopenharmony_ci * Use span=(8*write_bw) in single wb case as indicated by 10628c2ecf20Sopenharmony_ci * (thresh - wb_thresh ~= 0) and transit to wb_thresh in JBOD case. 10638c2ecf20Sopenharmony_ci * 10648c2ecf20Sopenharmony_ci * wb_thresh thresh - wb_thresh 10658c2ecf20Sopenharmony_ci * span = --------- * (8 * write_bw) + ------------------ * wb_thresh 10668c2ecf20Sopenharmony_ci * thresh thresh 10678c2ecf20Sopenharmony_ci */ 10688c2ecf20Sopenharmony_ci span = (dtc->thresh - wb_thresh + 8 * write_bw) * (u64)x >> 16; 10698c2ecf20Sopenharmony_ci x_intercept = wb_setpoint + span; 10708c2ecf20Sopenharmony_ci 10718c2ecf20Sopenharmony_ci if (dtc->wb_dirty < x_intercept - span / 4) { 10728c2ecf20Sopenharmony_ci pos_ratio = div64_u64(pos_ratio * (x_intercept - dtc->wb_dirty), 10738c2ecf20Sopenharmony_ci (x_intercept - wb_setpoint) | 1); 10748c2ecf20Sopenharmony_ci } else 10758c2ecf20Sopenharmony_ci pos_ratio /= 4; 10768c2ecf20Sopenharmony_ci 10778c2ecf20Sopenharmony_ci /* 10788c2ecf20Sopenharmony_ci * wb reserve area, safeguard against dirty pool underrun and disk idle 10798c2ecf20Sopenharmony_ci * It may push the desired control point of global dirty pages higher 10808c2ecf20Sopenharmony_ci * than setpoint. 
10818c2ecf20Sopenharmony_ci */ 10828c2ecf20Sopenharmony_ci x_intercept = wb_thresh / 2; 10838c2ecf20Sopenharmony_ci if (dtc->wb_dirty < x_intercept) { 10848c2ecf20Sopenharmony_ci if (dtc->wb_dirty > x_intercept / 8) 10858c2ecf20Sopenharmony_ci pos_ratio = div_u64(pos_ratio * x_intercept, 10868c2ecf20Sopenharmony_ci dtc->wb_dirty); 10878c2ecf20Sopenharmony_ci else 10888c2ecf20Sopenharmony_ci pos_ratio *= 8; 10898c2ecf20Sopenharmony_ci } 10908c2ecf20Sopenharmony_ci 10918c2ecf20Sopenharmony_ci dtc->pos_ratio = pos_ratio; 10928c2ecf20Sopenharmony_ci} 10938c2ecf20Sopenharmony_ci 10948c2ecf20Sopenharmony_cistatic void wb_update_write_bandwidth(struct bdi_writeback *wb, 10958c2ecf20Sopenharmony_ci unsigned long elapsed, 10968c2ecf20Sopenharmony_ci unsigned long written) 10978c2ecf20Sopenharmony_ci{ 10988c2ecf20Sopenharmony_ci const unsigned long period = roundup_pow_of_two(3 * HZ); 10998c2ecf20Sopenharmony_ci unsigned long avg = wb->avg_write_bandwidth; 11008c2ecf20Sopenharmony_ci unsigned long old = wb->write_bandwidth; 11018c2ecf20Sopenharmony_ci u64 bw; 11028c2ecf20Sopenharmony_ci 11038c2ecf20Sopenharmony_ci /* 11048c2ecf20Sopenharmony_ci * bw = written * HZ / elapsed 11058c2ecf20Sopenharmony_ci * 11068c2ecf20Sopenharmony_ci * bw * elapsed + write_bandwidth * (period - elapsed) 11078c2ecf20Sopenharmony_ci * write_bandwidth = --------------------------------------------------- 11088c2ecf20Sopenharmony_ci * period 11098c2ecf20Sopenharmony_ci * 11108c2ecf20Sopenharmony_ci * @written may have decreased due to account_page_redirty(). 11118c2ecf20Sopenharmony_ci * Avoid underflowing @bw calculation. 
11128c2ecf20Sopenharmony_ci */ 11138c2ecf20Sopenharmony_ci bw = written - min(written, wb->written_stamp); 11148c2ecf20Sopenharmony_ci bw *= HZ; 11158c2ecf20Sopenharmony_ci if (unlikely(elapsed > period)) { 11168c2ecf20Sopenharmony_ci bw = div64_ul(bw, elapsed); 11178c2ecf20Sopenharmony_ci avg = bw; 11188c2ecf20Sopenharmony_ci goto out; 11198c2ecf20Sopenharmony_ci } 11208c2ecf20Sopenharmony_ci bw += (u64)wb->write_bandwidth * (period - elapsed); 11218c2ecf20Sopenharmony_ci bw >>= ilog2(period); 11228c2ecf20Sopenharmony_ci 11238c2ecf20Sopenharmony_ci /* 11248c2ecf20Sopenharmony_ci * one more level of smoothing, for filtering out sudden spikes 11258c2ecf20Sopenharmony_ci */ 11268c2ecf20Sopenharmony_ci if (avg > old && old >= (unsigned long)bw) 11278c2ecf20Sopenharmony_ci avg -= (avg - old) >> 3; 11288c2ecf20Sopenharmony_ci 11298c2ecf20Sopenharmony_ci if (avg < old && old <= (unsigned long)bw) 11308c2ecf20Sopenharmony_ci avg += (old - avg) >> 3; 11318c2ecf20Sopenharmony_ci 11328c2ecf20Sopenharmony_ciout: 11338c2ecf20Sopenharmony_ci /* keep avg > 0 to guarantee that tot > 0 if there are dirty wbs */ 11348c2ecf20Sopenharmony_ci avg = max(avg, 1LU); 11358c2ecf20Sopenharmony_ci if (wb_has_dirty_io(wb)) { 11368c2ecf20Sopenharmony_ci long delta = avg - wb->avg_write_bandwidth; 11378c2ecf20Sopenharmony_ci WARN_ON_ONCE(atomic_long_add_return(delta, 11388c2ecf20Sopenharmony_ci &wb->bdi->tot_write_bandwidth) <= 0); 11398c2ecf20Sopenharmony_ci } 11408c2ecf20Sopenharmony_ci wb->write_bandwidth = bw; 11418c2ecf20Sopenharmony_ci wb->avg_write_bandwidth = avg; 11428c2ecf20Sopenharmony_ci} 11438c2ecf20Sopenharmony_ci 11448c2ecf20Sopenharmony_cistatic void update_dirty_limit(struct dirty_throttle_control *dtc) 11458c2ecf20Sopenharmony_ci{ 11468c2ecf20Sopenharmony_ci struct wb_domain *dom = dtc_dom(dtc); 11478c2ecf20Sopenharmony_ci unsigned long thresh = dtc->thresh; 11488c2ecf20Sopenharmony_ci unsigned long limit = dom->dirty_limit; 11498c2ecf20Sopenharmony_ci 
11508c2ecf20Sopenharmony_ci /* 11518c2ecf20Sopenharmony_ci * Follow up in one step. 11528c2ecf20Sopenharmony_ci */ 11538c2ecf20Sopenharmony_ci if (limit < thresh) { 11548c2ecf20Sopenharmony_ci limit = thresh; 11558c2ecf20Sopenharmony_ci goto update; 11568c2ecf20Sopenharmony_ci } 11578c2ecf20Sopenharmony_ci 11588c2ecf20Sopenharmony_ci /* 11598c2ecf20Sopenharmony_ci * Follow down slowly. Use the higher one as the target, because thresh 11608c2ecf20Sopenharmony_ci * may drop below dirty. This is exactly the reason to introduce 11618c2ecf20Sopenharmony_ci * dom->dirty_limit which is guaranteed to lie above the dirty pages. 11628c2ecf20Sopenharmony_ci */ 11638c2ecf20Sopenharmony_ci thresh = max(thresh, dtc->dirty); 11648c2ecf20Sopenharmony_ci if (limit > thresh) { 11658c2ecf20Sopenharmony_ci limit -= (limit - thresh) >> 5; 11668c2ecf20Sopenharmony_ci goto update; 11678c2ecf20Sopenharmony_ci } 11688c2ecf20Sopenharmony_ci return; 11698c2ecf20Sopenharmony_ciupdate: 11708c2ecf20Sopenharmony_ci dom->dirty_limit = limit; 11718c2ecf20Sopenharmony_ci} 11728c2ecf20Sopenharmony_ci 11738c2ecf20Sopenharmony_cistatic void domain_update_bandwidth(struct dirty_throttle_control *dtc, 11748c2ecf20Sopenharmony_ci unsigned long now) 11758c2ecf20Sopenharmony_ci{ 11768c2ecf20Sopenharmony_ci struct wb_domain *dom = dtc_dom(dtc); 11778c2ecf20Sopenharmony_ci 11788c2ecf20Sopenharmony_ci /* 11798c2ecf20Sopenharmony_ci * check locklessly first to optimize away locking for the most time 11808c2ecf20Sopenharmony_ci */ 11818c2ecf20Sopenharmony_ci if (time_before(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) 11828c2ecf20Sopenharmony_ci return; 11838c2ecf20Sopenharmony_ci 11848c2ecf20Sopenharmony_ci spin_lock(&dom->lock); 11858c2ecf20Sopenharmony_ci if (time_after_eq(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) { 11868c2ecf20Sopenharmony_ci update_dirty_limit(dtc); 11878c2ecf20Sopenharmony_ci dom->dirty_limit_tstamp = now; 11888c2ecf20Sopenharmony_ci } 11898c2ecf20Sopenharmony_ci 
spin_unlock(&dom->lock); 11908c2ecf20Sopenharmony_ci} 11918c2ecf20Sopenharmony_ci 11928c2ecf20Sopenharmony_ci/* 11938c2ecf20Sopenharmony_ci * Maintain wb->dirty_ratelimit, the base dirty throttle rate. 11948c2ecf20Sopenharmony_ci * 11958c2ecf20Sopenharmony_ci * Normal wb tasks will be curbed at or below it in long term. 11968c2ecf20Sopenharmony_ci * Obviously it should be around (write_bw / N) when there are N dd tasks. 11978c2ecf20Sopenharmony_ci */ 11988c2ecf20Sopenharmony_cistatic void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc, 11998c2ecf20Sopenharmony_ci unsigned long dirtied, 12008c2ecf20Sopenharmony_ci unsigned long elapsed) 12018c2ecf20Sopenharmony_ci{ 12028c2ecf20Sopenharmony_ci struct bdi_writeback *wb = dtc->wb; 12038c2ecf20Sopenharmony_ci unsigned long dirty = dtc->dirty; 12048c2ecf20Sopenharmony_ci unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh); 12058c2ecf20Sopenharmony_ci unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh); 12068c2ecf20Sopenharmony_ci unsigned long setpoint = (freerun + limit) / 2; 12078c2ecf20Sopenharmony_ci unsigned long write_bw = wb->avg_write_bandwidth; 12088c2ecf20Sopenharmony_ci unsigned long dirty_ratelimit = wb->dirty_ratelimit; 12098c2ecf20Sopenharmony_ci unsigned long dirty_rate; 12108c2ecf20Sopenharmony_ci unsigned long task_ratelimit; 12118c2ecf20Sopenharmony_ci unsigned long balanced_dirty_ratelimit; 12128c2ecf20Sopenharmony_ci unsigned long step; 12138c2ecf20Sopenharmony_ci unsigned long x; 12148c2ecf20Sopenharmony_ci unsigned long shift; 12158c2ecf20Sopenharmony_ci 12168c2ecf20Sopenharmony_ci /* 12178c2ecf20Sopenharmony_ci * The dirty rate will match the writeout rate in long term, except 12188c2ecf20Sopenharmony_ci * when dirty pages are truncated by userspace or re-dirtied by FS. 
12198c2ecf20Sopenharmony_ci */ 12208c2ecf20Sopenharmony_ci dirty_rate = (dirtied - wb->dirtied_stamp) * HZ / elapsed; 12218c2ecf20Sopenharmony_ci 12228c2ecf20Sopenharmony_ci /* 12238c2ecf20Sopenharmony_ci * task_ratelimit reflects each dd's dirty rate for the past 200ms. 12248c2ecf20Sopenharmony_ci */ 12258c2ecf20Sopenharmony_ci task_ratelimit = (u64)dirty_ratelimit * 12268c2ecf20Sopenharmony_ci dtc->pos_ratio >> RATELIMIT_CALC_SHIFT; 12278c2ecf20Sopenharmony_ci task_ratelimit++; /* it helps rampup dirty_ratelimit from tiny values */ 12288c2ecf20Sopenharmony_ci 12298c2ecf20Sopenharmony_ci /* 12308c2ecf20Sopenharmony_ci * A linear estimation of the "balanced" throttle rate. The theory is, 12318c2ecf20Sopenharmony_ci * if there are N dd tasks, each throttled at task_ratelimit, the wb's 12328c2ecf20Sopenharmony_ci * dirty_rate will be measured to be (N * task_ratelimit). So the below 12338c2ecf20Sopenharmony_ci * formula will yield the balanced rate limit (write_bw / N). 12348c2ecf20Sopenharmony_ci * 12358c2ecf20Sopenharmony_ci * Note that the expanded form is not a pure rate feedback: 12368c2ecf20Sopenharmony_ci * rate_(i+1) = rate_(i) * (write_bw / dirty_rate) (1) 12378c2ecf20Sopenharmony_ci * but also takes pos_ratio into account: 12388c2ecf20Sopenharmony_ci * rate_(i+1) = rate_(i) * (write_bw / dirty_rate) * pos_ratio (2) 12398c2ecf20Sopenharmony_ci * 12408c2ecf20Sopenharmony_ci * (1) is not realistic because pos_ratio also takes part in balancing 12418c2ecf20Sopenharmony_ci * the dirty rate. Consider the state 12428c2ecf20Sopenharmony_ci * pos_ratio = 0.5 (3) 12438c2ecf20Sopenharmony_ci * rate = 2 * (write_bw / N) (4) 12448c2ecf20Sopenharmony_ci * If (1) is used, it will stuck in that state! 
Because each dd will 12458c2ecf20Sopenharmony_ci * be throttled at 12468c2ecf20Sopenharmony_ci * task_ratelimit = pos_ratio * rate = (write_bw / N) (5) 12478c2ecf20Sopenharmony_ci * yielding 12488c2ecf20Sopenharmony_ci * dirty_rate = N * task_ratelimit = write_bw (6) 12498c2ecf20Sopenharmony_ci * put (6) into (1) we get 12508c2ecf20Sopenharmony_ci * rate_(i+1) = rate_(i) (7) 12518c2ecf20Sopenharmony_ci * 12528c2ecf20Sopenharmony_ci * So we end up using (2) to always keep 12538c2ecf20Sopenharmony_ci * rate_(i+1) ~= (write_bw / N) (8) 12548c2ecf20Sopenharmony_ci * regardless of the value of pos_ratio. As long as (8) is satisfied, 12558c2ecf20Sopenharmony_ci * pos_ratio is able to drive itself to 1.0, which is not only where 12568c2ecf20Sopenharmony_ci * the dirty count meet the setpoint, but also where the slope of 12578c2ecf20Sopenharmony_ci * pos_ratio is most flat and hence task_ratelimit is least fluctuated. 12588c2ecf20Sopenharmony_ci */ 12598c2ecf20Sopenharmony_ci balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw, 12608c2ecf20Sopenharmony_ci dirty_rate | 1); 12618c2ecf20Sopenharmony_ci /* 12628c2ecf20Sopenharmony_ci * balanced_dirty_ratelimit ~= (write_bw / N) <= write_bw 12638c2ecf20Sopenharmony_ci */ 12648c2ecf20Sopenharmony_ci if (unlikely(balanced_dirty_ratelimit > write_bw)) 12658c2ecf20Sopenharmony_ci balanced_dirty_ratelimit = write_bw; 12668c2ecf20Sopenharmony_ci 12678c2ecf20Sopenharmony_ci /* 12688c2ecf20Sopenharmony_ci * We could safely do this and return immediately: 12698c2ecf20Sopenharmony_ci * 12708c2ecf20Sopenharmony_ci * wb->dirty_ratelimit = balanced_dirty_ratelimit; 12718c2ecf20Sopenharmony_ci * 12728c2ecf20Sopenharmony_ci * However to get a more stable dirty_ratelimit, the below elaborated 12738c2ecf20Sopenharmony_ci * code makes use of task_ratelimit to filter out singular points and 12748c2ecf20Sopenharmony_ci * limit the step size. 
12758c2ecf20Sopenharmony_ci * 12768c2ecf20Sopenharmony_ci * The below code essentially only uses the relative value of 12778c2ecf20Sopenharmony_ci * 12788c2ecf20Sopenharmony_ci * task_ratelimit - dirty_ratelimit 12798c2ecf20Sopenharmony_ci * = (pos_ratio - 1) * dirty_ratelimit 12808c2ecf20Sopenharmony_ci * 12818c2ecf20Sopenharmony_ci * which reflects the direction and size of dirty position error. 12828c2ecf20Sopenharmony_ci */ 12838c2ecf20Sopenharmony_ci 12848c2ecf20Sopenharmony_ci /* 12858c2ecf20Sopenharmony_ci * dirty_ratelimit will follow balanced_dirty_ratelimit iff 12868c2ecf20Sopenharmony_ci * task_ratelimit is on the same side of dirty_ratelimit, too. 12878c2ecf20Sopenharmony_ci * For example, when 12888c2ecf20Sopenharmony_ci * - dirty_ratelimit > balanced_dirty_ratelimit 12898c2ecf20Sopenharmony_ci * - dirty_ratelimit > task_ratelimit (dirty pages are above setpoint) 12908c2ecf20Sopenharmony_ci * lowering dirty_ratelimit will help meet both the position and rate 12918c2ecf20Sopenharmony_ci * control targets. Otherwise, don't update dirty_ratelimit if it will 12928c2ecf20Sopenharmony_ci * only help meet the rate target. After all, what the users ultimately 12938c2ecf20Sopenharmony_ci * feel and care are stable dirty rate and small position error. 12948c2ecf20Sopenharmony_ci * 12958c2ecf20Sopenharmony_ci * |task_ratelimit - dirty_ratelimit| is used to limit the step size 12968c2ecf20Sopenharmony_ci * and filter out the singular points of balanced_dirty_ratelimit. Which 12978c2ecf20Sopenharmony_ci * keeps jumping around randomly and can even leap far away at times 12988c2ecf20Sopenharmony_ci * due to the small 200ms estimation period of dirty_rate (we want to 12998c2ecf20Sopenharmony_ci * keep that period small to reduce time lags). 
13008c2ecf20Sopenharmony_ci */ 13018c2ecf20Sopenharmony_ci step = 0; 13028c2ecf20Sopenharmony_ci 13038c2ecf20Sopenharmony_ci /* 13048c2ecf20Sopenharmony_ci * For strictlimit case, calculations above were based on wb counters 13058c2ecf20Sopenharmony_ci * and limits (starting from pos_ratio = wb_position_ratio() and up to 13068c2ecf20Sopenharmony_ci * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate). 13078c2ecf20Sopenharmony_ci * Hence, to calculate "step" properly, we have to use wb_dirty as 13088c2ecf20Sopenharmony_ci * "dirty" and wb_setpoint as "setpoint". 13098c2ecf20Sopenharmony_ci * 13108c2ecf20Sopenharmony_ci * We rampup dirty_ratelimit forcibly if wb_dirty is low because 13118c2ecf20Sopenharmony_ci * it's possible that wb_thresh is close to zero due to inactivity 13128c2ecf20Sopenharmony_ci * of backing device. 13138c2ecf20Sopenharmony_ci */ 13148c2ecf20Sopenharmony_ci if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { 13158c2ecf20Sopenharmony_ci dirty = dtc->wb_dirty; 13168c2ecf20Sopenharmony_ci if (dtc->wb_dirty < 8) 13178c2ecf20Sopenharmony_ci setpoint = dtc->wb_dirty + 1; 13188c2ecf20Sopenharmony_ci else 13198c2ecf20Sopenharmony_ci setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2; 13208c2ecf20Sopenharmony_ci } 13218c2ecf20Sopenharmony_ci 13228c2ecf20Sopenharmony_ci if (dirty < setpoint) { 13238c2ecf20Sopenharmony_ci x = min3(wb->balanced_dirty_ratelimit, 13248c2ecf20Sopenharmony_ci balanced_dirty_ratelimit, task_ratelimit); 13258c2ecf20Sopenharmony_ci if (dirty_ratelimit < x) 13268c2ecf20Sopenharmony_ci step = x - dirty_ratelimit; 13278c2ecf20Sopenharmony_ci } else { 13288c2ecf20Sopenharmony_ci x = max3(wb->balanced_dirty_ratelimit, 13298c2ecf20Sopenharmony_ci balanced_dirty_ratelimit, task_ratelimit); 13308c2ecf20Sopenharmony_ci if (dirty_ratelimit > x) 13318c2ecf20Sopenharmony_ci step = dirty_ratelimit - x; 13328c2ecf20Sopenharmony_ci } 13338c2ecf20Sopenharmony_ci 13348c2ecf20Sopenharmony_ci /* 
13358c2ecf20Sopenharmony_ci * Don't pursue 100% rate matching. It's impossible since the balanced 13368c2ecf20Sopenharmony_ci * rate itself is constantly fluctuating. So decrease the track speed 13378c2ecf20Sopenharmony_ci * when it gets close to the target. Helps eliminate pointless tremors. 13388c2ecf20Sopenharmony_ci */ 13398c2ecf20Sopenharmony_ci shift = dirty_ratelimit / (2 * step + 1); 13408c2ecf20Sopenharmony_ci if (shift < BITS_PER_LONG) 13418c2ecf20Sopenharmony_ci step = DIV_ROUND_UP(step >> shift, 8); 13428c2ecf20Sopenharmony_ci else 13438c2ecf20Sopenharmony_ci step = 0; 13448c2ecf20Sopenharmony_ci 13458c2ecf20Sopenharmony_ci if (dirty_ratelimit < balanced_dirty_ratelimit) 13468c2ecf20Sopenharmony_ci dirty_ratelimit += step; 13478c2ecf20Sopenharmony_ci else 13488c2ecf20Sopenharmony_ci dirty_ratelimit -= step; 13498c2ecf20Sopenharmony_ci 13508c2ecf20Sopenharmony_ci wb->dirty_ratelimit = max(dirty_ratelimit, 1UL); 13518c2ecf20Sopenharmony_ci wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit; 13528c2ecf20Sopenharmony_ci 13538c2ecf20Sopenharmony_ci trace_bdi_dirty_ratelimit(wb, dirty_rate, task_ratelimit); 13548c2ecf20Sopenharmony_ci} 13558c2ecf20Sopenharmony_ci 13568c2ecf20Sopenharmony_cistatic void __wb_update_bandwidth(struct dirty_throttle_control *gdtc, 13578c2ecf20Sopenharmony_ci struct dirty_throttle_control *mdtc, 13588c2ecf20Sopenharmony_ci unsigned long start_time, 13598c2ecf20Sopenharmony_ci bool update_ratelimit) 13608c2ecf20Sopenharmony_ci{ 13618c2ecf20Sopenharmony_ci struct bdi_writeback *wb = gdtc->wb; 13628c2ecf20Sopenharmony_ci unsigned long now = jiffies; 13638c2ecf20Sopenharmony_ci unsigned long elapsed = now - wb->bw_time_stamp; 13648c2ecf20Sopenharmony_ci unsigned long dirtied; 13658c2ecf20Sopenharmony_ci unsigned long written; 13668c2ecf20Sopenharmony_ci 13678c2ecf20Sopenharmony_ci lockdep_assert_held(&wb->list_lock); 13688c2ecf20Sopenharmony_ci 13698c2ecf20Sopenharmony_ci /* 13708c2ecf20Sopenharmony_ci * rate-limit, only update 
once every 200ms. 13718c2ecf20Sopenharmony_ci */ 13728c2ecf20Sopenharmony_ci if (elapsed < BANDWIDTH_INTERVAL) 13738c2ecf20Sopenharmony_ci return; 13748c2ecf20Sopenharmony_ci 13758c2ecf20Sopenharmony_ci dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]); 13768c2ecf20Sopenharmony_ci written = percpu_counter_read(&wb->stat[WB_WRITTEN]); 13778c2ecf20Sopenharmony_ci 13788c2ecf20Sopenharmony_ci /* 13798c2ecf20Sopenharmony_ci * Skip quiet periods when disk bandwidth is under-utilized. 13808c2ecf20Sopenharmony_ci * (at least 1s idle time between two flusher runs) 13818c2ecf20Sopenharmony_ci */ 13828c2ecf20Sopenharmony_ci if (elapsed > HZ && time_before(wb->bw_time_stamp, start_time)) 13838c2ecf20Sopenharmony_ci goto snapshot; 13848c2ecf20Sopenharmony_ci 13858c2ecf20Sopenharmony_ci if (update_ratelimit) { 13868c2ecf20Sopenharmony_ci domain_update_bandwidth(gdtc, now); 13878c2ecf20Sopenharmony_ci wb_update_dirty_ratelimit(gdtc, dirtied, elapsed); 13888c2ecf20Sopenharmony_ci 13898c2ecf20Sopenharmony_ci /* 13908c2ecf20Sopenharmony_ci * @mdtc is always NULL if !CGROUP_WRITEBACK but the 13918c2ecf20Sopenharmony_ci * compiler has no way to figure that out. Help it. 
13928c2ecf20Sopenharmony_ci */ 13938c2ecf20Sopenharmony_ci if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) { 13948c2ecf20Sopenharmony_ci domain_update_bandwidth(mdtc, now); 13958c2ecf20Sopenharmony_ci wb_update_dirty_ratelimit(mdtc, dirtied, elapsed); 13968c2ecf20Sopenharmony_ci } 13978c2ecf20Sopenharmony_ci } 13988c2ecf20Sopenharmony_ci wb_update_write_bandwidth(wb, elapsed, written); 13998c2ecf20Sopenharmony_ci 14008c2ecf20Sopenharmony_cisnapshot: 14018c2ecf20Sopenharmony_ci wb->dirtied_stamp = dirtied; 14028c2ecf20Sopenharmony_ci wb->written_stamp = written; 14038c2ecf20Sopenharmony_ci wb->bw_time_stamp = now; 14048c2ecf20Sopenharmony_ci} 14058c2ecf20Sopenharmony_ci 14068c2ecf20Sopenharmony_civoid wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time) 14078c2ecf20Sopenharmony_ci{ 14088c2ecf20Sopenharmony_ci struct dirty_throttle_control gdtc = { GDTC_INIT(wb) }; 14098c2ecf20Sopenharmony_ci 14108c2ecf20Sopenharmony_ci __wb_update_bandwidth(&gdtc, NULL, start_time, false); 14118c2ecf20Sopenharmony_ci} 14128c2ecf20Sopenharmony_ci 14138c2ecf20Sopenharmony_ci/* 14148c2ecf20Sopenharmony_ci * After a task dirtied this many pages, balance_dirty_pages_ratelimited() 14158c2ecf20Sopenharmony_ci * will look to see if it needs to start dirty throttling. 14168c2ecf20Sopenharmony_ci * 14178c2ecf20Sopenharmony_ci * If dirty_poll_interval is too low, big NUMA machines will call the expensive 14188c2ecf20Sopenharmony_ci * global_zone_page_state() too often. So scale it near-sqrt to the safety margin 14198c2ecf20Sopenharmony_ci * (the number of pages we may dirty without exceeding the dirty limits). 
14208c2ecf20Sopenharmony_ci */ 14218c2ecf20Sopenharmony_cistatic unsigned long dirty_poll_interval(unsigned long dirty, 14228c2ecf20Sopenharmony_ci unsigned long thresh) 14238c2ecf20Sopenharmony_ci{ 14248c2ecf20Sopenharmony_ci if (thresh > dirty) 14258c2ecf20Sopenharmony_ci return 1UL << (ilog2(thresh - dirty) >> 1); 14268c2ecf20Sopenharmony_ci 14278c2ecf20Sopenharmony_ci return 1; 14288c2ecf20Sopenharmony_ci} 14298c2ecf20Sopenharmony_ci 14308c2ecf20Sopenharmony_cistatic unsigned long wb_max_pause(struct bdi_writeback *wb, 14318c2ecf20Sopenharmony_ci unsigned long wb_dirty) 14328c2ecf20Sopenharmony_ci{ 14338c2ecf20Sopenharmony_ci unsigned long bw = wb->avg_write_bandwidth; 14348c2ecf20Sopenharmony_ci unsigned long t; 14358c2ecf20Sopenharmony_ci 14368c2ecf20Sopenharmony_ci /* 14378c2ecf20Sopenharmony_ci * Limit pause time for small memory systems. If sleeping for too long 14388c2ecf20Sopenharmony_ci * time, a small pool of dirty/writeback pages may go empty and disk go 14398c2ecf20Sopenharmony_ci * idle. 14408c2ecf20Sopenharmony_ci * 14418c2ecf20Sopenharmony_ci * 8 serves as the safety ratio. 
14428c2ecf20Sopenharmony_ci */ 14438c2ecf20Sopenharmony_ci t = wb_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8)); 14448c2ecf20Sopenharmony_ci t++; 14458c2ecf20Sopenharmony_ci 14468c2ecf20Sopenharmony_ci return min_t(unsigned long, t, MAX_PAUSE); 14478c2ecf20Sopenharmony_ci} 14488c2ecf20Sopenharmony_ci 14498c2ecf20Sopenharmony_cistatic long wb_min_pause(struct bdi_writeback *wb, 14508c2ecf20Sopenharmony_ci long max_pause, 14518c2ecf20Sopenharmony_ci unsigned long task_ratelimit, 14528c2ecf20Sopenharmony_ci unsigned long dirty_ratelimit, 14538c2ecf20Sopenharmony_ci int *nr_dirtied_pause) 14548c2ecf20Sopenharmony_ci{ 14558c2ecf20Sopenharmony_ci long hi = ilog2(wb->avg_write_bandwidth); 14568c2ecf20Sopenharmony_ci long lo = ilog2(wb->dirty_ratelimit); 14578c2ecf20Sopenharmony_ci long t; /* target pause */ 14588c2ecf20Sopenharmony_ci long pause; /* estimated next pause */ 14598c2ecf20Sopenharmony_ci int pages; /* target nr_dirtied_pause */ 14608c2ecf20Sopenharmony_ci 14618c2ecf20Sopenharmony_ci /* target for 10ms pause on 1-dd case */ 14628c2ecf20Sopenharmony_ci t = max(1, HZ / 100); 14638c2ecf20Sopenharmony_ci 14648c2ecf20Sopenharmony_ci /* 14658c2ecf20Sopenharmony_ci * Scale up pause time for concurrent dirtiers in order to reduce CPU 14668c2ecf20Sopenharmony_ci * overheads. 14678c2ecf20Sopenharmony_ci * 14688c2ecf20Sopenharmony_ci * (N * 10ms) on 2^N concurrent tasks. 14698c2ecf20Sopenharmony_ci */ 14708c2ecf20Sopenharmony_ci if (hi > lo) 14718c2ecf20Sopenharmony_ci t += (hi - lo) * (10 * HZ) / 1024; 14728c2ecf20Sopenharmony_ci 14738c2ecf20Sopenharmony_ci /* 14748c2ecf20Sopenharmony_ci * This is a bit convoluted. We try to base the next nr_dirtied_pause 14758c2ecf20Sopenharmony_ci * on the much more stable dirty_ratelimit. However the next pause time 14768c2ecf20Sopenharmony_ci * will be computed based on task_ratelimit and the two rate limits may 14778c2ecf20Sopenharmony_ci * depart considerably at some time. 
Especially if task_ratelimit goes 14788c2ecf20Sopenharmony_ci * below dirty_ratelimit/2 and the target pause is max_pause, the next 14798c2ecf20Sopenharmony_ci * pause time will be max_pause*2 _trimmed down_ to max_pause. As a 14808c2ecf20Sopenharmony_ci * result task_ratelimit won't be executed faithfully, which could 14818c2ecf20Sopenharmony_ci * eventually bring down dirty_ratelimit. 14828c2ecf20Sopenharmony_ci * 14838c2ecf20Sopenharmony_ci * We apply two rules to fix it up: 14848c2ecf20Sopenharmony_ci * 1) try to estimate the next pause time and if necessary, use a lower 14858c2ecf20Sopenharmony_ci * nr_dirtied_pause so as not to exceed max_pause. When this happens, 14868c2ecf20Sopenharmony_ci * nr_dirtied_pause will be "dancing" with task_ratelimit. 14878c2ecf20Sopenharmony_ci * 2) limit the target pause time to max_pause/2, so that the normal 14888c2ecf20Sopenharmony_ci * small fluctuations of task_ratelimit won't trigger rule (1) and 14898c2ecf20Sopenharmony_ci * nr_dirtied_pause will remain as stable as dirty_ratelimit. 14908c2ecf20Sopenharmony_ci */ 14918c2ecf20Sopenharmony_ci t = min(t, 1 + max_pause / 2); 14928c2ecf20Sopenharmony_ci pages = dirty_ratelimit * t / roundup_pow_of_two(HZ); 14938c2ecf20Sopenharmony_ci 14948c2ecf20Sopenharmony_ci /* 14958c2ecf20Sopenharmony_ci * Tiny nr_dirtied_pause is found to hurt I/O performance in the test 14968c2ecf20Sopenharmony_ci * case fio-mmap-randwrite-64k, which does 16*{sync read, async write}. 14978c2ecf20Sopenharmony_ci * When the 16 consecutive reads are often interrupted by some dirty 14988c2ecf20Sopenharmony_ci * throttling pause during the async writes, cfq will go into idles 14998c2ecf20Sopenharmony_ci * (deadline is fine). So push nr_dirtied_pause as high as possible 15008c2ecf20Sopenharmony_ci * until reaches DIRTY_POLL_THRESH=32 pages. 
15018c2ecf20Sopenharmony_ci */ 15028c2ecf20Sopenharmony_ci if (pages < DIRTY_POLL_THRESH) { 15038c2ecf20Sopenharmony_ci t = max_pause; 15048c2ecf20Sopenharmony_ci pages = dirty_ratelimit * t / roundup_pow_of_two(HZ); 15058c2ecf20Sopenharmony_ci if (pages > DIRTY_POLL_THRESH) { 15068c2ecf20Sopenharmony_ci pages = DIRTY_POLL_THRESH; 15078c2ecf20Sopenharmony_ci t = HZ * DIRTY_POLL_THRESH / dirty_ratelimit; 15088c2ecf20Sopenharmony_ci } 15098c2ecf20Sopenharmony_ci } 15108c2ecf20Sopenharmony_ci 15118c2ecf20Sopenharmony_ci pause = HZ * pages / (task_ratelimit + 1); 15128c2ecf20Sopenharmony_ci if (pause > max_pause) { 15138c2ecf20Sopenharmony_ci t = max_pause; 15148c2ecf20Sopenharmony_ci pages = task_ratelimit * t / roundup_pow_of_two(HZ); 15158c2ecf20Sopenharmony_ci } 15168c2ecf20Sopenharmony_ci 15178c2ecf20Sopenharmony_ci *nr_dirtied_pause = pages; 15188c2ecf20Sopenharmony_ci /* 15198c2ecf20Sopenharmony_ci * The minimal pause time will normally be half the target pause time. 15208c2ecf20Sopenharmony_ci */ 15218c2ecf20Sopenharmony_ci return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t; 15228c2ecf20Sopenharmony_ci} 15238c2ecf20Sopenharmony_ci 15248c2ecf20Sopenharmony_cistatic inline void wb_dirty_limits(struct dirty_throttle_control *dtc) 15258c2ecf20Sopenharmony_ci{ 15268c2ecf20Sopenharmony_ci struct bdi_writeback *wb = dtc->wb; 15278c2ecf20Sopenharmony_ci unsigned long wb_reclaimable; 15288c2ecf20Sopenharmony_ci 15298c2ecf20Sopenharmony_ci /* 15308c2ecf20Sopenharmony_ci * wb_thresh is not treated as some limiting factor as 15318c2ecf20Sopenharmony_ci * dirty_thresh, due to reasons 15328c2ecf20Sopenharmony_ci * - in JBOD setup, wb_thresh can fluctuate a lot 15338c2ecf20Sopenharmony_ci * - in a system with HDD and USB key, the USB key may somehow 15348c2ecf20Sopenharmony_ci * go into state (wb_dirty >> wb_thresh) either because 15358c2ecf20Sopenharmony_ci * wb_dirty starts high, or because wb_thresh drops low. 
15368c2ecf20Sopenharmony_ci * In this case we don't want to hard throttle the USB key 15378c2ecf20Sopenharmony_ci * dirtiers for 100 seconds until wb_dirty drops under 15388c2ecf20Sopenharmony_ci * wb_thresh. Instead the auxiliary wb control line in 15398c2ecf20Sopenharmony_ci * wb_position_ratio() will let the dirtier task progress 15408c2ecf20Sopenharmony_ci * at some rate <= (write_bw / 2) for bringing down wb_dirty. 15418c2ecf20Sopenharmony_ci */ 15428c2ecf20Sopenharmony_ci dtc->wb_thresh = __wb_calc_thresh(dtc); 15438c2ecf20Sopenharmony_ci dtc->wb_bg_thresh = dtc->thresh ? 15448c2ecf20Sopenharmony_ci div64_u64(dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0; 15458c2ecf20Sopenharmony_ci 15468c2ecf20Sopenharmony_ci /* 15478c2ecf20Sopenharmony_ci * In order to avoid the stacked BDI deadlock we need 15488c2ecf20Sopenharmony_ci * to ensure we accurately count the 'dirty' pages when 15498c2ecf20Sopenharmony_ci * the threshold is low. 15508c2ecf20Sopenharmony_ci * 15518c2ecf20Sopenharmony_ci * Otherwise it would be possible to get thresh+n pages 15528c2ecf20Sopenharmony_ci * reported dirty, even though there are thresh-m pages 15538c2ecf20Sopenharmony_ci * actually dirty; with m+n sitting in the percpu 15548c2ecf20Sopenharmony_ci * deltas. 15558c2ecf20Sopenharmony_ci */ 15568c2ecf20Sopenharmony_ci if (dtc->wb_thresh < 2 * wb_stat_error()) { 15578c2ecf20Sopenharmony_ci wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE); 15588c2ecf20Sopenharmony_ci dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK); 15598c2ecf20Sopenharmony_ci } else { 15608c2ecf20Sopenharmony_ci wb_reclaimable = wb_stat(wb, WB_RECLAIMABLE); 15618c2ecf20Sopenharmony_ci dtc->wb_dirty = wb_reclaimable + wb_stat(wb, WB_WRITEBACK); 15628c2ecf20Sopenharmony_ci } 15638c2ecf20Sopenharmony_ci} 15648c2ecf20Sopenharmony_ci 15658c2ecf20Sopenharmony_ci/* 15668c2ecf20Sopenharmony_ci * balance_dirty_pages() must be called by processes which are generating dirty 15678c2ecf20Sopenharmony_ci * data. 
It looks at the number of dirty pages in the machine and will force 15688c2ecf20Sopenharmony_ci * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2. 15698c2ecf20Sopenharmony_ci * If we're over `background_thresh' then the writeback threads are woken to 15708c2ecf20Sopenharmony_ci * perform some writeout. 15718c2ecf20Sopenharmony_ci */ 15728c2ecf20Sopenharmony_cistatic void balance_dirty_pages(struct bdi_writeback *wb, 15738c2ecf20Sopenharmony_ci unsigned long pages_dirtied) 15748c2ecf20Sopenharmony_ci{ 15758c2ecf20Sopenharmony_ci struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) }; 15768c2ecf20Sopenharmony_ci struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) }; 15778c2ecf20Sopenharmony_ci struct dirty_throttle_control * const gdtc = &gdtc_stor; 15788c2ecf20Sopenharmony_ci struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ? 15798c2ecf20Sopenharmony_ci &mdtc_stor : NULL; 15808c2ecf20Sopenharmony_ci struct dirty_throttle_control *sdtc; 15818c2ecf20Sopenharmony_ci unsigned long nr_reclaimable; /* = file_dirty */ 15828c2ecf20Sopenharmony_ci long period; 15838c2ecf20Sopenharmony_ci long pause; 15848c2ecf20Sopenharmony_ci long max_pause; 15858c2ecf20Sopenharmony_ci long min_pause; 15868c2ecf20Sopenharmony_ci int nr_dirtied_pause; 15878c2ecf20Sopenharmony_ci bool dirty_exceeded = false; 15888c2ecf20Sopenharmony_ci unsigned long task_ratelimit; 15898c2ecf20Sopenharmony_ci unsigned long dirty_ratelimit; 15908c2ecf20Sopenharmony_ci struct backing_dev_info *bdi = wb->bdi; 15918c2ecf20Sopenharmony_ci bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT; 15928c2ecf20Sopenharmony_ci unsigned long start_time = jiffies; 15938c2ecf20Sopenharmony_ci 15948c2ecf20Sopenharmony_ci for (;;) { 15958c2ecf20Sopenharmony_ci unsigned long now = jiffies; 15968c2ecf20Sopenharmony_ci unsigned long dirty, thresh, bg_thresh; 15978c2ecf20Sopenharmony_ci unsigned long m_dirty = 0; /* stop bogus uninit warnings */ 
15988c2ecf20Sopenharmony_ci unsigned long m_thresh = 0; 15998c2ecf20Sopenharmony_ci unsigned long m_bg_thresh = 0; 16008c2ecf20Sopenharmony_ci 16018c2ecf20Sopenharmony_ci nr_reclaimable = global_node_page_state(NR_FILE_DIRTY); 16028c2ecf20Sopenharmony_ci gdtc->avail = global_dirtyable_memory(); 16038c2ecf20Sopenharmony_ci gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK); 16048c2ecf20Sopenharmony_ci 16058c2ecf20Sopenharmony_ci domain_dirty_limits(gdtc); 16068c2ecf20Sopenharmony_ci 16078c2ecf20Sopenharmony_ci if (unlikely(strictlimit)) { 16088c2ecf20Sopenharmony_ci wb_dirty_limits(gdtc); 16098c2ecf20Sopenharmony_ci 16108c2ecf20Sopenharmony_ci dirty = gdtc->wb_dirty; 16118c2ecf20Sopenharmony_ci thresh = gdtc->wb_thresh; 16128c2ecf20Sopenharmony_ci bg_thresh = gdtc->wb_bg_thresh; 16138c2ecf20Sopenharmony_ci } else { 16148c2ecf20Sopenharmony_ci dirty = gdtc->dirty; 16158c2ecf20Sopenharmony_ci thresh = gdtc->thresh; 16168c2ecf20Sopenharmony_ci bg_thresh = gdtc->bg_thresh; 16178c2ecf20Sopenharmony_ci } 16188c2ecf20Sopenharmony_ci 16198c2ecf20Sopenharmony_ci if (mdtc) { 16208c2ecf20Sopenharmony_ci unsigned long filepages, headroom, writeback; 16218c2ecf20Sopenharmony_ci 16228c2ecf20Sopenharmony_ci /* 16238c2ecf20Sopenharmony_ci * If @wb belongs to !root memcg, repeat the same 16248c2ecf20Sopenharmony_ci * basic calculations for the memcg domain. 
16258c2ecf20Sopenharmony_ci */ 16268c2ecf20Sopenharmony_ci mem_cgroup_wb_stats(wb, &filepages, &headroom, 16278c2ecf20Sopenharmony_ci &mdtc->dirty, &writeback); 16288c2ecf20Sopenharmony_ci mdtc->dirty += writeback; 16298c2ecf20Sopenharmony_ci mdtc_calc_avail(mdtc, filepages, headroom); 16308c2ecf20Sopenharmony_ci 16318c2ecf20Sopenharmony_ci domain_dirty_limits(mdtc); 16328c2ecf20Sopenharmony_ci 16338c2ecf20Sopenharmony_ci if (unlikely(strictlimit)) { 16348c2ecf20Sopenharmony_ci wb_dirty_limits(mdtc); 16358c2ecf20Sopenharmony_ci m_dirty = mdtc->wb_dirty; 16368c2ecf20Sopenharmony_ci m_thresh = mdtc->wb_thresh; 16378c2ecf20Sopenharmony_ci m_bg_thresh = mdtc->wb_bg_thresh; 16388c2ecf20Sopenharmony_ci } else { 16398c2ecf20Sopenharmony_ci m_dirty = mdtc->dirty; 16408c2ecf20Sopenharmony_ci m_thresh = mdtc->thresh; 16418c2ecf20Sopenharmony_ci m_bg_thresh = mdtc->bg_thresh; 16428c2ecf20Sopenharmony_ci } 16438c2ecf20Sopenharmony_ci } 16448c2ecf20Sopenharmony_ci 16458c2ecf20Sopenharmony_ci /* 16468c2ecf20Sopenharmony_ci * Throttle it only when the background writeback cannot 16478c2ecf20Sopenharmony_ci * catch-up. This avoids (excessively) small writeouts 16488c2ecf20Sopenharmony_ci * when the wb limits are ramping up in case of !strictlimit. 16498c2ecf20Sopenharmony_ci * 16508c2ecf20Sopenharmony_ci * In strictlimit case make decision based on the wb counters 16518c2ecf20Sopenharmony_ci * and limits. Small writeouts when the wb limits are ramping 16528c2ecf20Sopenharmony_ci * up are the price we consciously pay for strictlimit-ing. 16538c2ecf20Sopenharmony_ci * 16548c2ecf20Sopenharmony_ci * If memcg domain is in effect, @dirty should be under 16558c2ecf20Sopenharmony_ci * both global and memcg freerun ceilings. 
16568c2ecf20Sopenharmony_ci */ 16578c2ecf20Sopenharmony_ci if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) && 16588c2ecf20Sopenharmony_ci (!mdtc || 16598c2ecf20Sopenharmony_ci m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) { 16608c2ecf20Sopenharmony_ci unsigned long intv; 16618c2ecf20Sopenharmony_ci unsigned long m_intv; 16628c2ecf20Sopenharmony_ci 16638c2ecf20Sopenharmony_cifree_running: 16648c2ecf20Sopenharmony_ci intv = dirty_poll_interval(dirty, thresh); 16658c2ecf20Sopenharmony_ci m_intv = ULONG_MAX; 16668c2ecf20Sopenharmony_ci 16678c2ecf20Sopenharmony_ci current->dirty_paused_when = now; 16688c2ecf20Sopenharmony_ci current->nr_dirtied = 0; 16698c2ecf20Sopenharmony_ci if (mdtc) 16708c2ecf20Sopenharmony_ci m_intv = dirty_poll_interval(m_dirty, m_thresh); 16718c2ecf20Sopenharmony_ci current->nr_dirtied_pause = min(intv, m_intv); 16728c2ecf20Sopenharmony_ci break; 16738c2ecf20Sopenharmony_ci } 16748c2ecf20Sopenharmony_ci 16758c2ecf20Sopenharmony_ci if (unlikely(!writeback_in_progress(wb))) 16768c2ecf20Sopenharmony_ci wb_start_background_writeback(wb); 16778c2ecf20Sopenharmony_ci 16788c2ecf20Sopenharmony_ci mem_cgroup_flush_foreign(wb); 16798c2ecf20Sopenharmony_ci 16808c2ecf20Sopenharmony_ci /* 16818c2ecf20Sopenharmony_ci * Calculate global domain's pos_ratio and select the 16828c2ecf20Sopenharmony_ci * global dtc by default. 16838c2ecf20Sopenharmony_ci */ 16848c2ecf20Sopenharmony_ci if (!strictlimit) { 16858c2ecf20Sopenharmony_ci wb_dirty_limits(gdtc); 16868c2ecf20Sopenharmony_ci 16878c2ecf20Sopenharmony_ci if ((current->flags & PF_LOCAL_THROTTLE) && 16888c2ecf20Sopenharmony_ci gdtc->wb_dirty < 16898c2ecf20Sopenharmony_ci dirty_freerun_ceiling(gdtc->wb_thresh, 16908c2ecf20Sopenharmony_ci gdtc->wb_bg_thresh)) 16918c2ecf20Sopenharmony_ci /* 16928c2ecf20Sopenharmony_ci * LOCAL_THROTTLE tasks must not be throttled 16938c2ecf20Sopenharmony_ci * when below the per-wb freerun ceiling. 
16948c2ecf20Sopenharmony_ci */ 16958c2ecf20Sopenharmony_ci goto free_running; 16968c2ecf20Sopenharmony_ci } 16978c2ecf20Sopenharmony_ci 16988c2ecf20Sopenharmony_ci dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) && 16998c2ecf20Sopenharmony_ci ((gdtc->dirty > gdtc->thresh) || strictlimit); 17008c2ecf20Sopenharmony_ci 17018c2ecf20Sopenharmony_ci wb_position_ratio(gdtc); 17028c2ecf20Sopenharmony_ci sdtc = gdtc; 17038c2ecf20Sopenharmony_ci 17048c2ecf20Sopenharmony_ci if (mdtc) { 17058c2ecf20Sopenharmony_ci /* 17068c2ecf20Sopenharmony_ci * If memcg domain is in effect, calculate its 17078c2ecf20Sopenharmony_ci * pos_ratio. @wb should satisfy constraints from 17088c2ecf20Sopenharmony_ci * both global and memcg domains. Choose the one 17098c2ecf20Sopenharmony_ci * w/ lower pos_ratio. 17108c2ecf20Sopenharmony_ci */ 17118c2ecf20Sopenharmony_ci if (!strictlimit) { 17128c2ecf20Sopenharmony_ci wb_dirty_limits(mdtc); 17138c2ecf20Sopenharmony_ci 17148c2ecf20Sopenharmony_ci if ((current->flags & PF_LOCAL_THROTTLE) && 17158c2ecf20Sopenharmony_ci mdtc->wb_dirty < 17168c2ecf20Sopenharmony_ci dirty_freerun_ceiling(mdtc->wb_thresh, 17178c2ecf20Sopenharmony_ci mdtc->wb_bg_thresh)) 17188c2ecf20Sopenharmony_ci /* 17198c2ecf20Sopenharmony_ci * LOCAL_THROTTLE tasks must not be 17208c2ecf20Sopenharmony_ci * throttled when below the per-wb 17218c2ecf20Sopenharmony_ci * freerun ceiling. 
17228c2ecf20Sopenharmony_ci */ 17238c2ecf20Sopenharmony_ci goto free_running; 17248c2ecf20Sopenharmony_ci } 17258c2ecf20Sopenharmony_ci dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) && 17268c2ecf20Sopenharmony_ci ((mdtc->dirty > mdtc->thresh) || strictlimit); 17278c2ecf20Sopenharmony_ci 17288c2ecf20Sopenharmony_ci wb_position_ratio(mdtc); 17298c2ecf20Sopenharmony_ci if (mdtc->pos_ratio < gdtc->pos_ratio) 17308c2ecf20Sopenharmony_ci sdtc = mdtc; 17318c2ecf20Sopenharmony_ci } 17328c2ecf20Sopenharmony_ci 17338c2ecf20Sopenharmony_ci if (dirty_exceeded && !wb->dirty_exceeded) 17348c2ecf20Sopenharmony_ci wb->dirty_exceeded = 1; 17358c2ecf20Sopenharmony_ci 17368c2ecf20Sopenharmony_ci if (time_is_before_jiffies(wb->bw_time_stamp + 17378c2ecf20Sopenharmony_ci BANDWIDTH_INTERVAL)) { 17388c2ecf20Sopenharmony_ci spin_lock(&wb->list_lock); 17398c2ecf20Sopenharmony_ci __wb_update_bandwidth(gdtc, mdtc, start_time, true); 17408c2ecf20Sopenharmony_ci spin_unlock(&wb->list_lock); 17418c2ecf20Sopenharmony_ci } 17428c2ecf20Sopenharmony_ci 17438c2ecf20Sopenharmony_ci /* throttle according to the chosen dtc */ 17448c2ecf20Sopenharmony_ci dirty_ratelimit = wb->dirty_ratelimit; 17458c2ecf20Sopenharmony_ci task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >> 17468c2ecf20Sopenharmony_ci RATELIMIT_CALC_SHIFT; 17478c2ecf20Sopenharmony_ci max_pause = wb_max_pause(wb, sdtc->wb_dirty); 17488c2ecf20Sopenharmony_ci min_pause = wb_min_pause(wb, max_pause, 17498c2ecf20Sopenharmony_ci task_ratelimit, dirty_ratelimit, 17508c2ecf20Sopenharmony_ci &nr_dirtied_pause); 17518c2ecf20Sopenharmony_ci 17528c2ecf20Sopenharmony_ci if (unlikely(task_ratelimit == 0)) { 17538c2ecf20Sopenharmony_ci period = max_pause; 17548c2ecf20Sopenharmony_ci pause = max_pause; 17558c2ecf20Sopenharmony_ci goto pause; 17568c2ecf20Sopenharmony_ci } 17578c2ecf20Sopenharmony_ci period = HZ * pages_dirtied / task_ratelimit; 17588c2ecf20Sopenharmony_ci pause = period; 17598c2ecf20Sopenharmony_ci if 
(current->dirty_paused_when) 17608c2ecf20Sopenharmony_ci pause -= now - current->dirty_paused_when; 17618c2ecf20Sopenharmony_ci /* 17628c2ecf20Sopenharmony_ci * For less than 1s think time (ext3/4 may block the dirtier 17638c2ecf20Sopenharmony_ci * for up to 800ms from time to time on 1-HDD; so does xfs, 17648c2ecf20Sopenharmony_ci * however at much less frequency), try to compensate it in 17658c2ecf20Sopenharmony_ci * future periods by updating the virtual time; otherwise just 17668c2ecf20Sopenharmony_ci * do a reset, as it may be a light dirtier. 17678c2ecf20Sopenharmony_ci */ 17688c2ecf20Sopenharmony_ci if (pause < min_pause) { 17698c2ecf20Sopenharmony_ci trace_balance_dirty_pages(wb, 17708c2ecf20Sopenharmony_ci sdtc->thresh, 17718c2ecf20Sopenharmony_ci sdtc->bg_thresh, 17728c2ecf20Sopenharmony_ci sdtc->dirty, 17738c2ecf20Sopenharmony_ci sdtc->wb_thresh, 17748c2ecf20Sopenharmony_ci sdtc->wb_dirty, 17758c2ecf20Sopenharmony_ci dirty_ratelimit, 17768c2ecf20Sopenharmony_ci task_ratelimit, 17778c2ecf20Sopenharmony_ci pages_dirtied, 17788c2ecf20Sopenharmony_ci period, 17798c2ecf20Sopenharmony_ci min(pause, 0L), 17808c2ecf20Sopenharmony_ci start_time); 17818c2ecf20Sopenharmony_ci if (pause < -HZ) { 17828c2ecf20Sopenharmony_ci current->dirty_paused_when = now; 17838c2ecf20Sopenharmony_ci current->nr_dirtied = 0; 17848c2ecf20Sopenharmony_ci } else if (period) { 17858c2ecf20Sopenharmony_ci current->dirty_paused_when += period; 17868c2ecf20Sopenharmony_ci current->nr_dirtied = 0; 17878c2ecf20Sopenharmony_ci } else if (current->nr_dirtied_pause <= pages_dirtied) 17888c2ecf20Sopenharmony_ci current->nr_dirtied_pause += pages_dirtied; 17898c2ecf20Sopenharmony_ci break; 17908c2ecf20Sopenharmony_ci } 17918c2ecf20Sopenharmony_ci if (unlikely(pause > max_pause)) { 17928c2ecf20Sopenharmony_ci /* for occasional dropped task_ratelimit */ 17938c2ecf20Sopenharmony_ci now += min(pause - max_pause, max_pause); 17948c2ecf20Sopenharmony_ci pause = max_pause; 17958c2ecf20Sopenharmony_ci } 
17968c2ecf20Sopenharmony_ci 17978c2ecf20Sopenharmony_cipause: 17988c2ecf20Sopenharmony_ci trace_balance_dirty_pages(wb, 17998c2ecf20Sopenharmony_ci sdtc->thresh, 18008c2ecf20Sopenharmony_ci sdtc->bg_thresh, 18018c2ecf20Sopenharmony_ci sdtc->dirty, 18028c2ecf20Sopenharmony_ci sdtc->wb_thresh, 18038c2ecf20Sopenharmony_ci sdtc->wb_dirty, 18048c2ecf20Sopenharmony_ci dirty_ratelimit, 18058c2ecf20Sopenharmony_ci task_ratelimit, 18068c2ecf20Sopenharmony_ci pages_dirtied, 18078c2ecf20Sopenharmony_ci period, 18088c2ecf20Sopenharmony_ci pause, 18098c2ecf20Sopenharmony_ci start_time); 18108c2ecf20Sopenharmony_ci __set_current_state(TASK_KILLABLE); 18118c2ecf20Sopenharmony_ci wb->dirty_sleep = now; 18128c2ecf20Sopenharmony_ci io_schedule_timeout(pause); 18138c2ecf20Sopenharmony_ci 18148c2ecf20Sopenharmony_ci current->dirty_paused_when = now + pause; 18158c2ecf20Sopenharmony_ci current->nr_dirtied = 0; 18168c2ecf20Sopenharmony_ci current->nr_dirtied_pause = nr_dirtied_pause; 18178c2ecf20Sopenharmony_ci 18188c2ecf20Sopenharmony_ci /* 18198c2ecf20Sopenharmony_ci * This is typically equal to (dirty < thresh) and can also 18208c2ecf20Sopenharmony_ci * keep "1000+ dd on a slow USB stick" under control. 18218c2ecf20Sopenharmony_ci */ 18228c2ecf20Sopenharmony_ci if (task_ratelimit) 18238c2ecf20Sopenharmony_ci break; 18248c2ecf20Sopenharmony_ci 18258c2ecf20Sopenharmony_ci /* 18268c2ecf20Sopenharmony_ci * In the case of an unresponding NFS server and the NFS dirty 18278c2ecf20Sopenharmony_ci * pages exceeds dirty_thresh, give the other good wb's a pipe 18288c2ecf20Sopenharmony_ci * to go through, so that tasks on them still remain responsive. 18298c2ecf20Sopenharmony_ci * 18308c2ecf20Sopenharmony_ci * In theory 1 page is enough to keep the consumer-producer 18318c2ecf20Sopenharmony_ci * pipe going: the flusher cleans 1 page => the task dirties 1 18328c2ecf20Sopenharmony_ci * more page. However wb_dirty has accounting errors. 
So use 18338c2ecf20Sopenharmony_ci * the larger and more IO friendly wb_stat_error. 18348c2ecf20Sopenharmony_ci */ 18358c2ecf20Sopenharmony_ci if (sdtc->wb_dirty <= wb_stat_error()) 18368c2ecf20Sopenharmony_ci break; 18378c2ecf20Sopenharmony_ci 18388c2ecf20Sopenharmony_ci if (fatal_signal_pending(current)) 18398c2ecf20Sopenharmony_ci break; 18408c2ecf20Sopenharmony_ci } 18418c2ecf20Sopenharmony_ci 18428c2ecf20Sopenharmony_ci if (!dirty_exceeded && wb->dirty_exceeded) 18438c2ecf20Sopenharmony_ci wb->dirty_exceeded = 0; 18448c2ecf20Sopenharmony_ci 18458c2ecf20Sopenharmony_ci if (writeback_in_progress(wb)) 18468c2ecf20Sopenharmony_ci return; 18478c2ecf20Sopenharmony_ci 18488c2ecf20Sopenharmony_ci /* 18498c2ecf20Sopenharmony_ci * In laptop mode, we wait until hitting the higher threshold before 18508c2ecf20Sopenharmony_ci * starting background writeout, and then write out all the way down 18518c2ecf20Sopenharmony_ci * to the lower threshold. So slow writers cause minimal disk activity. 18528c2ecf20Sopenharmony_ci * 18538c2ecf20Sopenharmony_ci * In normal mode, we start background writeout at the lower 18548c2ecf20Sopenharmony_ci * background_thresh, to keep the amount of dirty memory low. 18558c2ecf20Sopenharmony_ci */ 18568c2ecf20Sopenharmony_ci if (laptop_mode) 18578c2ecf20Sopenharmony_ci return; 18588c2ecf20Sopenharmony_ci 18598c2ecf20Sopenharmony_ci if (nr_reclaimable > gdtc->bg_thresh) 18608c2ecf20Sopenharmony_ci wb_start_background_writeback(wb); 18618c2ecf20Sopenharmony_ci} 18628c2ecf20Sopenharmony_ci 18638c2ecf20Sopenharmony_cistatic DEFINE_PER_CPU(int, bdp_ratelimits); 18648c2ecf20Sopenharmony_ci 18658c2ecf20Sopenharmony_ci/* 18668c2ecf20Sopenharmony_ci * Normal tasks are throttled by 18678c2ecf20Sopenharmony_ci * loop { 18688c2ecf20Sopenharmony_ci * dirty tsk->nr_dirtied_pause pages; 18698c2ecf20Sopenharmony_ci * take a snap in balance_dirty_pages(); 18708c2ecf20Sopenharmony_ci * } 18718c2ecf20Sopenharmony_ci * However there is a worst case. 
If every task exit immediately when dirtied
 * (tsk->nr_dirtied_pause - 1) pages, balance_dirty_pages() will never be
 * called to throttle the page dirties. The solution is to save the not yet
 * throttled page dirties in dirty_throttle_leaks on task exit and charge them
 * randomly into the running tasks. This works well for the above worst case,
 * as the new task will pick up and accumulate the old task's leaked dirty
 * count and eventually get throttled.
 */
DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;

/**
 * balance_dirty_pages_ratelimited - balance dirty memory state
 * @mapping: address_space which was dirtied
 *
 * Processes which are dirtying memory should call in here once for each page
 * which was newly dirtied.  The function will periodically check the system's
 * dirty state and will initiate writeback if needed.
 *
 * On really big machines, get_writeback_state is expensive, so try to avoid
 * calling it too often (ratelimiting).  But once we're over the dirty memory
 * limit we decrease the ratelimiting by a lot, to prevent individual processes
 * from overshooting the limit by (ratelimit_pages) each.
 */
void balance_dirty_pages_ratelimited(struct address_space *mapping)
{
	struct inode *inode = mapping->host;
	struct backing_dev_info *bdi = inode_to_bdi(inode);
	struct bdi_writeback *wb = NULL;
	int ratelimit;
	int *p;

	/* Nothing to balance if the bdi does not do writeback at all. */
	if (!(bdi->capabilities & BDI_CAP_WRITEBACK))
		return;

	/*
	 * Prefer the memcg-aware wb for this task; fall back to the
	 * bdi-embedded wb when cgroup writeback is disabled or the
	 * cgroup wb could not be created.
	 */
	if (inode_cgwb_enabled(inode))
		wb = wb_get_create_current(bdi, GFP_KERNEL);
	if (!wb)
		wb = &bdi->wb;

	ratelimit = current->nr_dirtied_pause;
	/*
	 * Once the wb is over its dirty limits, poll much more often:
	 * cap the per-task batch at 32KB worth of pages.
	 */
	if (wb->dirty_exceeded)
		ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));

	/* preempt off pairs with the this_cpu_ptr() accesses below */
	preempt_disable();
	/*
	 * This prevents one CPU to accumulate too many dirtied pages without
	 * calling into balance_dirty_pages(), which can happen when there are
	 * 1000+ tasks, all of them start dirtying pages at exactly the same
	 * time, hence all honoured too large initial task->nr_dirtied_pause.
	 */
	p =  this_cpu_ptr(&bdp_ratelimits);
	if (unlikely(current->nr_dirtied >= ratelimit))
		*p = 0;
	else if (unlikely(*p >= ratelimit_pages)) {
		*p = 0;
		ratelimit = 0;
	}
	/*
	 * Pick up the dirtied pages by the exited tasks. This avoids lots of
	 * short-lived tasks (eg. gcc invocations in a kernel build) escaping
	 * the dirty throttling and livelock other long-run dirtiers.
	 */
	p = this_cpu_ptr(&dirty_throttle_leaks);
	if (*p > 0 && current->nr_dirtied < ratelimit) {
		unsigned long nr_pages_dirtied;
		nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied);
		*p -= nr_pages_dirtied;
		current->nr_dirtied += nr_pages_dirtied;
	}
	preempt_enable();

	/* Over the per-task batch: do the real (expensive) balancing. */
	if (unlikely(current->nr_dirtied >= ratelimit))
		balance_dirty_pages(wb, current->nr_dirtied);

	wb_put(wb);		/* pairs with wb_get_create_current() */
}
EXPORT_SYMBOL(balance_dirty_pages_ratelimited);

/**
 * wb_over_bg_thresh - does @wb need to be written back?
 * @wb: bdi_writeback of interest
 *
 * Determines whether background writeback should keep writing @wb or it's
 * clean enough.
 *
 * Return: %true if writeback should continue.
 */
bool wb_over_bg_thresh(struct bdi_writeback *wb)
{
	struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
	struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
	struct dirty_throttle_control * const gdtc = &gdtc_stor;
	/* mdtc is NULL unless mdtc_valid() says a memcg domain applies */
	struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
						     &mdtc_stor : NULL;

	/*
	 * Similar to balance_dirty_pages() but ignores pages being written
	 * as we're trying to decide whether to put more under writeback.
	 */
	gdtc->avail = global_dirtyable_memory();
	gdtc->dirty = global_node_page_state(NR_FILE_DIRTY);
	domain_dirty_limits(gdtc);

	/* Global domain over its background threshold? */
	if (gdtc->dirty > gdtc->bg_thresh)
		return true;

	/* This wb's share of the global domain over its threshold? */
	if (wb_stat(wb, WB_RECLAIMABLE) >
	    wb_calc_thresh(gdtc->wb, gdtc->bg_thresh))
		return true;

	if (mdtc) {
		unsigned long filepages, headroom, writeback;

		mem_cgroup_wb_stats(wb, &filepages, &headroom, &mdtc->dirty,
				    &writeback);
		mdtc_calc_avail(mdtc, filepages, headroom);
		domain_dirty_limits(mdtc);	/* ditto, ignore writeback */

		/* Same two checks, against the memcg domain. */
		if (mdtc->dirty > mdtc->bg_thresh)
			return true;

		if (wb_stat(wb, WB_RECLAIMABLE) >
		    wb_calc_thresh(mdtc->wb, mdtc->bg_thresh))
			return true;
	}

	return false;
}

/*
 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
 */
int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
	void *buffer, size_t *length, loff_t *ppos)
{
	unsigned int old_interval = dirty_writeback_interval;
	int ret;

	ret = proc_dointvec(table, write, buffer, length, ppos);

	/*
	 * Writing 0 to dirty_writeback_interval will disable periodic writeback
	 * and a different non-zero value will wakeup the writeback threads.
	 * wb_wakeup_delayed() would be more appropriate, but it's a pain to
	 * iterate over all bdis and wbs.
	 * The reason we do this is to make the change take effect immediately.
	 */
	if (!ret && write && dirty_writeback_interval &&
		dirty_writeback_interval != old_interval)
		wakeup_flusher_threads(WB_REASON_PERIODIC);

	return ret;
}

#ifdef CONFIG_BLOCK
/*
 * Laptop-mode timer expiry: kick the flusher threads for this bdi so the
 * accumulated dirty data gets written out in one burst.
 */
void laptop_mode_timer_fn(struct timer_list *t)
{
	struct backing_dev_info *backing_dev_info =
		from_timer(backing_dev_info, t, laptop_mode_wb_timer);

	wakeup_flusher_threads_bdi(backing_dev_info, WB_REASON_LAPTOP_TIMER);
}

/*
 * We've spun up the disk and we're in laptop mode: schedule writeback
 * of all dirty data a few seconds from now.  If the flush is already scheduled
 * then push it back - the user is still using the disk.
 */
void laptop_io_completion(struct backing_dev_info *info)
{
	mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode);
}

/*
 * We're in laptop mode and we've just synced. The sync's writes will have
 * caused another writeback to be scheduled by laptop_io_completion.
 * Nothing needs to be written back anymore, so we unschedule the writeback.
 */
void laptop_sync_completion(void)
{
	struct backing_dev_info *bdi;

	/* bdi_list is RCU-protected; we only del_timer(), no blocking. */
	rcu_read_lock();

	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
		del_timer(&bdi->laptop_mode_wb_timer);

	rcu_read_unlock();
}
#endif

/*
 * If ratelimit_pages is too high then we can get into dirty-data overload
 * if a large number of processes all perform writes at the same time.
 * If it is too low then SMP machines will call the (expensive)
 * get_writeback_state too often.
 *
 * Here we set ratelimit_pages to a level which ensures that when all CPUs are
 * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
 * thresholds.
 */

/*
 * Recompute the global ratelimit_pages from the current dirty threshold and
 * online CPU count; also refreshes global_wb_domain.dirty_limit as a side
 * effect.  Clamped to a floor of 16 pages.
 */
void writeback_set_ratelimit(void)
{
	struct wb_domain *dom = &global_wb_domain;
	unsigned long background_thresh;
	unsigned long dirty_thresh;

	global_dirty_limits(&background_thresh, &dirty_thresh);
	dom->dirty_limit = dirty_thresh;
	ratelimit_pages = dirty_thresh / (num_online_cpus() * 32);
	if (ratelimit_pages < 16)
		ratelimit_pages = 16;
}

/*
 * CPU hotplug callback: ratelimit_pages depends on num_online_cpus(), so it
 * must be recomputed whenever the set of online CPUs changes.
 */
static int page_writeback_cpu_online(unsigned int cpu)
{
	writeback_set_ratelimit();
	return 0;
}

/*
 * Called early on to tune the page writeback dirty limits.
 *
 * We used to scale dirty pages according to how total memory
 * related to pages that could be allocated for buffers.
 *
 * However, that was when we used "dirty_ratio" to scale with
 * all memory, and we don't do that any more. "dirty_ratio"
 * is now applied to total non-HIGHPAGE memory, and as such we can't
 * get into the old insane situation any more where we had
 * large amounts of dirty pages compared to a small amount of
 * non-HIGHMEM memory.
 *
 * But we might still want to scale the dirty_ratio by how
 * much memory the box has..
 */
void __init page_writeback_init(void)
{
	BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL));

	/*
	 * Recalculate the ratelimit both when a CPU comes online (startup
	 * callback of the dynamic state) and when one goes away (teardown
	 * callback of the DEAD state).
	 */
	cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mm/writeback:online",
			  page_writeback_cpu_online, NULL);
	cpuhp_setup_state(CPUHP_MM_WRITEBACK_DEAD, "mm/writeback:dead", NULL,
			  page_writeback_cpu_online);
}

/**
 * tag_pages_for_writeback - tag pages to be written by write_cache_pages
 * @mapping: address space structure to write
 * @start: starting page index
 * @end: ending page index (inclusive)
 *
 * This function scans the page range from @start to @end (inclusive) and tags
 * all pages that have DIRTY tag set with a special TOWRITE tag. The idea is
 * that write_cache_pages (or whoever calls this function) will then use
 * TOWRITE tag to identify pages eligible for writeback.  This mechanism is
 * used to avoid livelocking of writeback by a process steadily creating new
 * dirty pages in the file (thus it is important for this function to be quick
 * so that it can tag pages faster than a dirtying process can create them).
 */
void tag_pages_for_writeback(struct address_space *mapping,
			     pgoff_t start, pgoff_t end)
{
	XA_STATE(xas, &mapping->i_pages, start);
	unsigned int tagged = 0;
	void *page;

	xas_lock_irq(&xas);
	xas_for_each_marked(&xas, page, end, PAGECACHE_TAG_DIRTY) {
		xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE);
		if (++tagged % XA_CHECK_SCHED)
			continue;

		/*
		 * Every XA_CHECK_SCHED pages, drop the irq-disabled lock and
		 * reschedule to bound latency; xas_pause() lets the walk
		 * resume safely after the lock is retaken.
		 */
		xas_pause(&xas);
		xas_unlock_irq(&xas);
		cond_resched();
		xas_lock_irq(&xas);
	}
	xas_unlock_irq(&xas);
}
EXPORT_SYMBOL(tag_pages_for_writeback);

/**
 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
 * @mapping: address space structure to write
 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
 * @writepage: function called for each page
 * @data: data passed to writepage function
 *
 * If a page is already under I/O, write_cache_pages() skips it, even
 * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
 * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
 * and msync() need to guarantee that all the data which was dirty at the time
 * the call was made get new I/O started against them.  If wbc->sync_mode is
 * WB_SYNC_ALL then we were called for data integrity and we must wait for
 * existing IO to complete.
 *
 * To avoid livelocks (when other process dirties new pages), we first tag
 * pages which should be written back with TOWRITE tag and only then start
 * writing them. For data-integrity sync we have to be careful so that we do
 * not miss some pages (e.g., because some other process has cleared TOWRITE
 * tag we set). The rule we follow is that TOWRITE tag can be cleared only
 * by the process clearing the DIRTY tag (and submitting the page for IO).
 *
 * To avoid deadlocks between range_cyclic writeback and callers that hold
 * pages in PageWriteback to aggregate IO until write_cache_pages() returns,
 * we do not loop back to the start of the file. Doing so causes a page
 * lock/page writeback access order inversion - we should only ever lock
 * multiple pages in ascending page->index order, and looping back to the start
 * of the file violates that rule and causes deadlocks.
 *
 * Return: %0 on success, negative error code otherwise
 */
int write_cache_pages(struct address_space *mapping,
		      struct writeback_control *wbc, writepage_t writepage,
		      void *data)
{
	int ret = 0;
	int done = 0;
	int error;
	struct pagevec pvec;
	int nr_pages;
	pgoff_t index;
	pgoff_t end;		/* Inclusive */
	pgoff_t done_index;
	int range_whole = 0;
	xa_mark_t tag;

	pagevec_init(&pvec);
	if (wbc->range_cyclic) {
		index = mapping->writeback_index; /* prev offset */
		end = -1;
	} else {
		index = wbc->range_start >> PAGE_SHIFT;
		end = wbc->range_end >> PAGE_SHIFT;
		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
			range_whole = 1;
	}
	/*
	 * Integrity sync (and tagged_writepages) walks the TOWRITE tag set
	 * up-front to avoid being livelocked by concurrent dirtiers; plain
	 * memory-cleaning writeback walks the DIRTY tag directly.
	 */
	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) {
		tag_pages_for_writeback(mapping, index, end);
		tag = PAGECACHE_TAG_TOWRITE;
	} else {
		tag = PAGECACHE_TAG_DIRTY;
	}
	/* done_index tracks where a later call should resume. */
	done_index = index;
	while (!done && (index <= end)) {
		int i;

		nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
				tag);
		if (nr_pages == 0)
			break;

		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			done_index = page->index;

			lock_page(page);

			/*
			 * Page truncated or invalidated. We can freely skip it
			 * then, even for data integrity operations: the page
			 * has disappeared concurrently, so there could be no
			 * real expectation of this data interity operation
			 * even if there is now a new, dirty page at the same
			 * pagecache address.
			 */
			if (unlikely(page->mapping != mapping)) {
continue_unlock:
				unlock_page(page);
				continue;
			}

			if (!PageDirty(page)) {
				/* someone wrote it for us */
				goto continue_unlock;
			}

			if (PageWriteback(page)) {
				if (wbc->sync_mode != WB_SYNC_NONE)
					wait_on_page_writeback(page);
				else
					goto continue_unlock;
			}

			BUG_ON(PageWriteback(page));
			if (!clear_page_dirty_for_io(page))
				goto continue_unlock;

			trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
			error = (*writepage)(page, wbc, data);
			if (unlikely(error)) {
				/*
				 * Handle errors according to the type of
				 * writeback. There's no need to continue for
				 * background writeback. Just push done_index
				 * past this page so media errors won't choke
				 * writeout for the entire file. For integrity
				 * writeback, we must process the entire dirty
				 * set regardless of errors because the fs may
				 * still have state to clear for each page. In
				 * that case we continue processing and return
				 * the first error.
				 */
				if (error == AOP_WRITEPAGE_ACTIVATE) {
					unlock_page(page);
					error = 0;
				} else if (wbc->sync_mode != WB_SYNC_ALL) {
					ret = error;
					done_index = page->index + 1;
					done = 1;
					break;
				}
				if (!ret)
					ret = error;
			}

			/*
			 * We stop writing back only if we are not doing
			 * integrity sync. In case of integrity sync we have to
			 * keep going until we have written all the pages
			 * we tagged for writeback prior to entering this loop.
			 */
			if (--wbc->nr_to_write <= 0 &&
			    wbc->sync_mode == WB_SYNC_NONE) {
				done = 1;
				break;
			}
		}
		pagevec_release(&pvec);
		cond_resched();
	}

	/*
	 * If we hit the last page and there is more work to be done: wrap
	 * back the index back to the start of the file for the next
	 * time we are called.
	 */
	if (wbc->range_cyclic && !done)
		done_index = 0;
	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
		mapping->writeback_index = done_index;

	return ret;
}
EXPORT_SYMBOL(write_cache_pages);

/*
 * Function used by generic_writepages to call the real writepage
 * function and set the mapping flags on error
 */
static int __writepage(struct page *page, struct writeback_control *wbc,
		       void *data)
{
	struct address_space *mapping = data;
	int ret = mapping->a_ops->writepage(page, wbc);
	mapping_set_error(mapping, ret);
	return ret;
}

/**
 * generic_writepages - walk the list of dirty pages of the
 * given address space and writepage() all of them.
 * @mapping: address space structure to write
 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
 *
 * This is a library function, which implements the writepages()
 * address_space_operation.
 *
 * Return: %0 on success, negative error code otherwise
 */
int generic_writepages(struct address_space *mapping,
		       struct writeback_control *wbc)
{
	struct blk_plug plug;
	int ret;

	/* deal with chardevs and other special file */
	if (!mapping->a_ops->writepage)
		return 0;

	/* plug the block layer so the per-page writes can be merged */
	blk_start_plug(&plug);
	ret = write_cache_pages(mapping, wbc, __writepage, mapping);
	blk_finish_plug(&plug);
	return ret;
}

EXPORT_SYMBOL(generic_writepages);

/*
 * Write back some of @mapping's dirty pages, preferring the filesystem's
 * ->writepages() and falling back to generic_writepages().
 *
 * An -ENOMEM result is retried (after cond_resched() and a short
 * congestion wait) only for WB_SYNC_ALL, since integrity writeback
 * must not silently skip pages.
 */
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
	int ret;

	if (wbc->nr_to_write <= 0)
		return 0;
	while (1) {
		if (mapping->a_ops->writepages)
			ret = mapping->a_ops->writepages(mapping, wbc);
		else
			ret = generic_writepages(mapping, wbc);
		if ((ret != -ENOMEM) || (wbc->sync_mode != WB_SYNC_ALL))
			break;
		cond_resched();
		congestion_wait(BLK_RW_ASYNC, HZ/50);
	}
	return ret;
}

/**
 * write_one_page - write out a single page and wait on I/O
 * @page: the page to write
 *
 * The page must be locked by the caller and will be unlocked upon return.
 *
 * Note that the mapping's AS_EIO/AS_ENOSPC flags will be cleared when this
 * function returns.
 *
 * Return: %0 on success, negative error code otherwise
 */
int write_one_page(struct page *page)
{
	struct address_space *mapping = page->mapping;
	int ret = 0;
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_ALL,
		.nr_to_write = 1,
	};

	BUG_ON(!PageLocked(page));

	wait_on_page_writeback(page);

	if (clear_page_dirty_for_io(page)) {
		/* pin the page across ->writepage() and the writeback wait */
		get_page(page);
		ret = mapping->a_ops->writepage(page, &wbc);
		if (ret == 0)
			wait_on_page_writeback(page);
		put_page(page);
	} else {
		/* page wasn't dirty; still honour the unlock-on-return rule */
		unlock_page(page);
	}

	if (!ret)
		ret = filemap_check_errors(mapping);
	return ret;
}
EXPORT_SYMBOL(write_one_page);

/*
 * For address_spaces which do not use buffers nor write back.
 */
int __set_page_dirty_no_writeback(struct page *page)
{
	if (!PageDirty(page))
		return !TestSetPageDirty(page);
	return 0;
}

/*
 * Helper function for set_page_dirty family.
 *
 * Caller must hold lock_page_memcg().
 *
 * NOTE: This relies on being atomic wrt interrupts.
 */
void account_page_dirtied(struct page *page, struct address_space *mapping)
{
	struct inode *inode = mapping->host;

	trace_writeback_dirty_page(page, mapping);

	if (mapping_can_writeback(mapping)) {
		struct bdi_writeback *wb;

		inode_attach_wb(inode, page);
		wb = inode_to_wb(inode);

		/* memcg/zone/node counters, then per-writeback stats */
		__inc_lruvec_page_state(page, NR_FILE_DIRTY);
		__inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
		__inc_node_page_state(page, NR_DIRTIED);
		inc_wb_stat(wb, WB_RECLAIMABLE);
		inc_wb_stat(wb, WB_DIRTIED);
		task_io_account_write(PAGE_SIZE);
		current->nr_dirtied++;
		this_cpu_inc(bdp_ratelimits);

		mem_cgroup_track_foreign_dirty(page, wb);
	}
}

/*
 * Helper function for deaccounting dirty page without writeback.
 *
 * Caller must hold lock_page_memcg().
 */
void account_page_cleaned(struct page *page, struct address_space *mapping,
			  struct bdi_writeback *wb)
{
	if (mapping_can_writeback(mapping)) {
		dec_lruvec_page_state(page, NR_FILE_DIRTY);
		dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
		dec_wb_stat(wb, WB_RECLAIMABLE);
		task_io_account_cancelled_write(PAGE_SIZE);
	}
}

/*
 * For address_spaces which do not use buffers.  Just tag the page as dirty in
 * the xarray.
 *
 * This is also used when a single buffer is being dirtied: we want to set the
 * page dirty in that case, but not all the buffers.  This is a "bottom-up"
 * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying.
 *
 * The caller must ensure this doesn't race with truncation.  Most will simply
 * hold the page lock, but e.g. zap_pte_range() calls with the page mapped and
 * the pte lock held, which also locks out truncation.
 */
int __set_page_dirty_nobuffers(struct page *page)
{
	lock_page_memcg(page);
	if (!TestSetPageDirty(page)) {
		struct address_space *mapping = page_mapping(page);
		unsigned long flags;

		if (!mapping) {
			unlock_page_memcg(page);
			return 1;
		}

		xa_lock_irqsave(&mapping->i_pages, flags);
		BUG_ON(page_mapping(page) != mapping);
		WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
		account_page_dirtied(page, mapping);
		__xa_set_mark(&mapping->i_pages, page_index(page),
			      PAGECACHE_TAG_DIRTY);
		xa_unlock_irqrestore(&mapping->i_pages, flags);
		unlock_page_memcg(page);

		if (mapping->host) {
			/* !PageAnon && !swapper_space */
			__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
		}
		return 1;
	}
	unlock_page_memcg(page);
	return 0;
}
EXPORT_SYMBOL(__set_page_dirty_nobuffers);

/*
 * Call this whenever redirtying a page, to de-account the dirty counters
 * (NR_DIRTIED, WB_DIRTIED, tsk->nr_dirtied), so that they match the written
 * counters (NR_WRITTEN, WB_WRITTEN) in long term. The mismatches will lead to
 * systematic errors in balanced_dirty_ratelimit and the dirty pages position
 * control.
 */
void account_page_redirty(struct page *page)
{
	struct address_space *mapping = page->mapping;

	if (mapping && mapping_can_writeback(mapping)) {
		struct inode *inode = mapping->host;
		struct bdi_writeback *wb;
		struct wb_lock_cookie cookie = {};

		wb = unlocked_inode_to_wb_begin(inode, &cookie);
		current->nr_dirtied--;
		dec_node_page_state(page, NR_DIRTIED);
		dec_wb_stat(wb, WB_DIRTIED);
		unlocked_inode_to_wb_end(inode, &cookie);
	}
}
EXPORT_SYMBOL(account_page_redirty);

/*
 * When a writepage implementation decides that it doesn't want to write this
 * page for some reason, it should redirty the locked page via
 * redirty_page_for_writepage() and it should then unlock the page and return 0
 */
int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
{
	int ret;

	wbc->pages_skipped++;
	ret = __set_page_dirty_nobuffers(page);
	/* undo the NR_DIRTIED/WB_DIRTIED accounting the redirty just did */
	account_page_redirty(page);
	return ret;
}
EXPORT_SYMBOL(redirty_page_for_writepage);

/*
 * Dirty a page.
 *
 * For pages with a mapping this should be done under the page lock
 * for the benefit of asynchronous memory errors who prefer a consistent
 * dirty state. This rule can be broken in some special cases,
 * but should be better not to.
 *
 * If the mapping doesn't provide a set_page_dirty a_op, then
 * just fall through and assume that it wants buffer_heads.
 */
int set_page_dirty(struct page *page)
{
	struct address_space *mapping = page_mapping(page);

	page = compound_head(page);
	if (likely(mapping)) {
		int (*spd)(struct page *) = mapping->a_ops->set_page_dirty;
		/*
		 * readahead/lru_deactivate_page could remain
		 * PG_readahead/PG_reclaim due to race with end_page_writeback
		 * About readahead, if the page is written, the flags would be
		 * reset. So no problem.
		 * About lru_deactivate_page, if the page is redirty, the flag
		 * will be reset. So no problem. but if the page is used by readahead
		 * it will confuse readahead and make it restart the size rampup
		 * process. But it's a trivial problem.
		 */
		if (PageReclaim(page))
			ClearPageReclaim(page);
#ifdef CONFIG_BLOCK
		/* default to buffer_head-based dirtying if the fs has no op */
		if (!spd)
			spd = __set_page_dirty_buffers;
#endif
		return (*spd)(page);
	}
	/* no mapping: just the page flag, no accounting */
	if (!PageDirty(page)) {
		if (!TestSetPageDirty(page))
			return 1;
	}
	return 0;
}
EXPORT_SYMBOL(set_page_dirty);

/*
 * set_page_dirty() is racy if the caller has no reference against
 * page->mapping->host, and if the page is unlocked.  This is because another
 * CPU could truncate the page off the mapping and then free the mapping.
 *
 * Usually, the page _is_ locked, or the caller is a user-space process which
 * holds a reference on the inode by having an open file.
 *
 * In other cases, the page should be locked before running set_page_dirty().
 */
int set_page_dirty_lock(struct page *page)
{
	int ret;

	lock_page(page);
	ret = set_page_dirty(page);
	unlock_page(page);
	return ret;
}
EXPORT_SYMBOL(set_page_dirty_lock);

/*
 * This cancels just the dirty bit on the kernel page itself, it does NOT
 * actually remove dirty bits on any mmap's that may be around. It also
 * leaves the page tagged dirty, so any sync activity will still find it on
 * the dirty lists, and in particular, clear_page_dirty_for_io() will still
 * look at the dirty bits in the VM.
 *
 * Doing this should *normally* only ever be done when a page is truncated,
 * and is not actually mapped anywhere at all. However, fs/buffer.c does
 * this when it notices that somebody has cleaned out all the buffers on a
 * page without actually doing it through the VM. Can you say "ext3 is
 * horribly ugly"? Thought you could.
 */
void __cancel_dirty_page(struct page *page)
{
	struct address_space *mapping = page_mapping(page);

	if (mapping_can_writeback(mapping)) {
		struct inode *inode = mapping->host;
		struct bdi_writeback *wb;
		struct wb_lock_cookie cookie = {};

		lock_page_memcg(page);
		wb = unlocked_inode_to_wb_begin(inode, &cookie);

		if (TestClearPageDirty(page))
			account_page_cleaned(page, mapping, wb);

		unlocked_inode_to_wb_end(inode, &cookie);
		unlock_page_memcg(page);
	} else {
		ClearPageDirty(page);
	}
}
EXPORT_SYMBOL(__cancel_dirty_page);

/*
 * Clear a page's dirty flag, while caring for dirty memory accounting.
 * Returns true if the page was previously dirty.
 *
 * This is for preparing to put the page under writeout.  We leave the page
 * tagged as dirty in the xarray so that a concurrent write-for-sync
 * can discover it via a PAGECACHE_TAG_DIRTY walk.  The ->writepage
 * implementation will run either set_page_writeback() or set_page_dirty(),
 * at which stage we bring the page's dirty flag and xarray dirty tag
 * back into sync.
 *
 * This incoherency between the page's dirty flag and xarray tag is
 * unfortunate, but it only exists while the page is locked.
 */
int clear_page_dirty_for_io(struct page *page)
{
	struct address_space *mapping = page_mapping(page);
	int ret = 0;

	VM_BUG_ON_PAGE(!PageLocked(page), page);

	if (mapping && mapping_can_writeback(mapping)) {
		struct inode *inode = mapping->host;
		struct bdi_writeback *wb;
		struct wb_lock_cookie cookie = {};

		/*
		 * Yes, Virginia, this is indeed insane.
		 *
		 * We use this sequence to make sure that
		 * (a) we account for dirty stats properly
		 * (b) we tell the low-level filesystem to
		 *     mark the whole page dirty if it was
		 *     dirty in a pagetable. Only to then
		 * (c) clean the page again and return 1 to
		 *     cause the writeback.
		 *
		 * This way we avoid all nasty races with the
		 * dirty bit in multiple places and clearing
		 * them concurrently from different threads.
		 *
		 * Note! Normally the "set_page_dirty(page)"
		 * has no effect on the actual dirty bit - since
		 * that will already usually be set. But we
		 * need the side effects, and it can help us
		 * avoid races.
		 *
		 * We basically use the page "master dirty bit"
		 * as a serialization point for all the different
		 * threads doing their things.
		 */
		if (page_mkclean(page))
			set_page_dirty(page);
		/*
		 * We carefully synchronise fault handlers against
		 * installing a dirty pte and marking the page dirty
		 * at this point.  We do this by having them hold the
		 * page lock while dirtying the page, and pages are
		 * always locked coming in here, so we get the desired
		 * exclusion.
		 */
		wb = unlocked_inode_to_wb_begin(inode, &cookie);
		if (TestClearPageDirty(page)) {
			dec_lruvec_page_state(page, NR_FILE_DIRTY);
			dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
			dec_wb_stat(wb, WB_RECLAIMABLE);
			ret = 1;
		}
		unlocked_inode_to_wb_end(inode, &cookie);
		return ret;
	}
	return TestClearPageDirty(page);
}
EXPORT_SYMBOL(clear_page_dirty_for_io);

/*
 * Clear the page's writeback flag and, for mappings using writeback tags,
 * the xarray PAGECACHE_TAG_WRITEBACK mark, updating writeback statistics.
 * Returns the old writeback state.
 */
int test_clear_page_writeback(struct page *page)
{
	struct address_space *mapping = page_mapping(page);
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;
	int ret;

	memcg = lock_page_memcg(page);
	lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
	if (mapping && mapping_use_writeback_tags(mapping)) {
		struct inode *inode = mapping->host;
		struct backing_dev_info *bdi = inode_to_bdi(inode);
		unsigned long flags;

		xa_lock_irqsave(&mapping->i_pages, flags);
		ret = TestClearPageWriteback(page);
		if (ret) {
			__xa_clear_mark(&mapping->i_pages, page_index(page),
					PAGECACHE_TAG_WRITEBACK);
			if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
				struct bdi_writeback *wb = inode_to_wb(inode);

				dec_wb_stat(wb, WB_WRITEBACK);
				__wb_writeout_inc(wb);
			}
		}

		/*
		 * No writeback-tagged pages left in this mapping: clear the
		 * inode's writeback state at the superblock level.
		 */
		if (mapping->host && !mapping_tagged(mapping,
						     PAGECACHE_TAG_WRITEBACK))
			sb_clear_inode_writeback(mapping->host);

		xa_unlock_irqrestore(&mapping->i_pages, flags);
	} else {
		ret = TestClearPageWriteback(page);
	}
	if (ret) {
		dec_lruvec_state(lruvec, NR_WRITEBACK);
		dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
		inc_node_page_state(page, NR_WRITTEN);
	}
	__unlock_page_memcg(memcg);
	return ret;
}

/*
 * Mark a page as under writeback, tagging it in the xarray when the mapping
 * uses writeback tags. Returns the old writeback state. @keep_write controls
 * whether the PAGECACHE_TAG_TOWRITE mark is preserved.
 */
int __test_set_page_writeback(struct page *page, bool keep_write)
{
	struct address_space *mapping = page_mapping(page);
	int ret, access_ret;

	lock_page_memcg(page);
	if (mapping && mapping_use_writeback_tags(mapping)) {
		XA_STATE(xas, &mapping->i_pages, page_index(page));
		struct inode *inode = mapping->host;
		struct backing_dev_info *bdi = inode_to_bdi(inode);
		unsigned long flags;

		xas_lock_irqsave(&xas, flags);
		xas_load(&xas);
		ret = TestSetPageWriteback(page);
		if (!ret) {
			bool on_wblist;

			on_wblist = mapping_tagged(mapping,
						   PAGECACHE_TAG_WRITEBACK);

			xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK);
			if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT)
				inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK);

			/*
			 * We can come through here when swapping anonymous
			 * pages, so we don't necessarily have an inode to track
			 * for sync.
			 */
			if (mapping->host && !on_wblist)
				sb_mark_inode_writeback(mapping->host);
		}
		if (!PageDirty(page))
			xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
		if (!keep_write)
			xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
		xas_unlock_irqrestore(&xas, flags);
	} else {
		ret = TestSetPageWriteback(page);
	}
	if (!ret) {
		inc_lruvec_page_state(page, NR_WRITEBACK);
		inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
	}
	unlock_page_memcg(page);
	access_ret = arch_make_page_accessible(page);
	/*
	 * If writeback has been triggered on a page that cannot be made
	 * accessible, it is too late to recover here.
	 */
	VM_BUG_ON_PAGE(access_ret != 0, page);

	return ret;

}
EXPORT_SYMBOL(__test_set_page_writeback);

/*
 * Wait for a page to complete writeback
 */
void wait_on_page_writeback(struct page *page)
{
	while (PageWriteback(page)) {
		trace_wait_on_page_writeback(page, page_mapping(page));
		wait_on_page_bit(page, PG_writeback);
	}
}
EXPORT_SYMBOL_GPL(wait_on_page_writeback);

/**
 * wait_for_stable_page() - wait for writeback to finish, if necessary.
 * @page: The page to wait on.
 *
 * This function determines if the given page is related to a backing device
 * that requires page contents to be held stable during writeback. If so, then
 * it will wait for any pending writeback to complete.
 */
void wait_for_stable_page(struct page *page)
{
	page = thp_head(page);
	/*
	 * NOTE(review): page->mapping is dereferenced without a NULL check
	 * here; callers must guarantee the page still has a mapping.
	 */
	if (page->mapping->host->i_sb->s_iflags & SB_I_STABLE_WRITES)
		wait_on_page_writeback(page);
}
EXPORT_SYMBOL_GPL(wait_for_stable_page);