// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/fs-writeback.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 *
 * Contains all the functions related to writing back and waiting
 * upon dirty inodes against superblocks, and writing back dirty
 * pages against inodes.  ie: data writeback.  Writeout of the
 * inode itself is not handled here.
 *
 * 10Apr2002	Andrew Morton
 *		Split out of fs/inode.c
 *		Additions for address_space-based writeback
 */

#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/kthread.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/tracepoint.h>
#include <linux/device.h>
#include <linux/memcontrol.h>
#include "internal.h"

/*
 * 4MB minimal write chunk size
 */
#define MIN_WRITEBACK_PAGES	(4096UL >> (PAGE_SHIFT - 10))
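
/*
 * Worked example (illustrative): 4096UL is 4MB expressed in kilobytes.
 * With 4KB pages (PAGE_SHIFT == 12) this is 4096 >> 2 == 1024 pages,
 * and with 64KB pages (PAGE_SHIFT == 16) it is 4096 >> 6 == 64 pages,
 * i.e. 4MB either way.
 */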

/*
 * Passed into wb_writeback(), essentially a subset of writeback_control
 */
struct wb_writeback_work {
	long nr_pages;
	struct super_block *sb;
	enum writeback_sync_modes sync_mode;
	unsigned int tagged_writepages:1;
	unsigned int for_kupdate:1;
	unsigned int range_cyclic:1;
	unsigned int for_background:1;
	unsigned int for_sync:1;	/* sync(2) WB_SYNC_ALL writeback */
	unsigned int auto_free:1;	/* free on completion */
	enum wb_reason reason;		/* why was writeback initiated? */

	struct list_head list;		/* pending work list */
	struct wb_completion *done;	/* set if the caller waits */
};

/*
 * If an inode is constantly having its pages dirtied, but then the
 * updates stop dirtytime_expire_interval seconds in the past, it's
 * possible for the worst case time between when an inode has its
 * timestamps updated and when they finally get written out to be two
 * dirtytime_expire_intervals.  We set the default to 12 hours (in
 * seconds), which means most of the time inodes will have their
 * timestamps written to disk after 12 hours, but in the worst case a
 * few inodes might not have their timestamps updated for 24 hours.
 */
unsigned int dirtytime_expire_interval = 12 * 60 * 60;
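
/*
 * Illustration of the 2x worst case above: a dirtytime writeback pass
 * runs at t = 0 and an inode's timestamp is dirtied just afterwards.
 * At the t = 12h pass the timestamp is not yet dirtytime_expire_interval
 * seconds old, so it is skipped and only gets written at the t = 24h
 * pass, i.e. almost two full intervals after it was dirtied.
 */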

static inline struct inode *wb_inode(struct list_head *head)
{
	return list_entry(head, struct inode, i_io_list);
}

/*
 * Include the creation of the trace points after defining the
 * wb_writeback_work structure and inline functions so that the definition
 * remains local to this file.
 */
#define CREATE_TRACE_POINTS
#include <trace/events/writeback.h>

EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage);

static bool wb_io_lists_populated(struct bdi_writeback *wb)
{
	if (wb_has_dirty_io(wb)) {
		return false;
	} else {
		set_bit(WB_has_dirty_io, &wb->state);
		WARN_ON_ONCE(!wb->avg_write_bandwidth);
		atomic_long_add(wb->avg_write_bandwidth,
				&wb->bdi->tot_write_bandwidth);
		return true;
	}
}

static void wb_io_lists_depopulated(struct bdi_writeback *wb)
{
	if (wb_has_dirty_io(wb) && list_empty(&wb->b_dirty) &&
	    list_empty(&wb->b_io) && list_empty(&wb->b_more_io)) {
		clear_bit(WB_has_dirty_io, &wb->state);
		WARN_ON_ONCE(atomic_long_sub_return(wb->avg_write_bandwidth,
					&wb->bdi->tot_write_bandwidth) < 0);
	}
}

/**
 * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list
 * @inode: inode to be moved
 * @wb: target bdi_writeback
 * @head: one of @wb->b_{dirty|io|more_io|dirty_time}
 *
 * Move @inode->i_io_list to @head of @wb and set %WB_has_dirty_io.
 * Returns %true if @inode is the first occupant of the !dirty_time IO
 * lists; otherwise, %false.
 */
static bool inode_io_list_move_locked(struct inode *inode,
				      struct bdi_writeback *wb,
				      struct list_head *head)
{
	assert_spin_locked(&wb->list_lock);
	assert_spin_locked(&inode->i_lock);
	WARN_ON_ONCE(inode->i_state & I_FREEING);

	list_move(&inode->i_io_list, head);

	/* dirty_time doesn't count as dirty_io until expiration */
	if (head != &wb->b_dirty_time)
		return wb_io_lists_populated(wb);

	wb_io_lists_depopulated(wb);
	return false;
}

static void wb_wakeup(struct bdi_writeback *wb)
{
	spin_lock_irq(&wb->work_lock);
	if (test_bit(WB_registered, &wb->state))
		mod_delayed_work(bdi_wq, &wb->dwork, 0);
	spin_unlock_irq(&wb->work_lock);
}

static void finish_writeback_work(struct bdi_writeback *wb,
				  struct wb_writeback_work *work)
{
	struct wb_completion *done = work->done;

	if (work->auto_free)
		kfree(work);
	if (done) {
		wait_queue_head_t *waitq = done->waitq;

		/* @done can't be accessed after the following dec */
		if (atomic_dec_and_test(&done->cnt))
			wake_up_all(waitq);
	}
}

static void wb_queue_work(struct bdi_writeback *wb,
			  struct wb_writeback_work *work)
{
	trace_writeback_queue(wb, work);

	if (work->done)
		atomic_inc(&work->done->cnt);

	spin_lock_irq(&wb->work_lock);

	if (test_bit(WB_registered, &wb->state)) {
		list_add_tail(&work->list, &wb->work_list);
		mod_delayed_work(bdi_wq, &wb->dwork, 0);
	} else
		finish_writeback_work(wb, work);

	spin_unlock_irq(&wb->work_lock);
}

/**
 * wb_wait_for_completion - wait for completion of bdi_writeback_works
 * @done: target wb_completion
 *
 * Wait for one or more work items issued to @bdi with their ->done field
 * set to @done, which should have been initialized with
 * DEFINE_WB_COMPLETION().  This function returns after all such work items
 * are completed.  Work items which are waited upon aren't freed
 * automatically on completion.
 */
void wb_wait_for_completion(struct wb_completion *done)
{
	atomic_dec(&done->cnt);		/* put down the initial count */
	wait_event(*done->waitq, !atomic_read(&done->cnt));
}
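
/*
 * A typical caller pattern is a sketch like the following (see
 * bdi_split_work_to_wbs() below for an in-tree user):
 *
 *	DEFINE_WB_COMPLETION(done, bdi);
 *
 *	work->done = &done;
 *	wb_queue_work(wb, work);
 *	wb_wait_for_completion(&done);
 */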

#ifdef CONFIG_CGROUP_WRITEBACK

/*
 * Parameters for foreign inode detection, see wbc_detach_inode() for how
 * they're used.
 *
 * These parameters are inherently heuristic as the detection target
 * itself is fuzzy.  All we want to do is detach an inode from the
 * current owner if other cgroups are writing to it too much.
 *
 * The current cgroup writeback is built on the assumption that multiple
 * cgroups writing to the same inode concurrently is very rare and a mode
 * of operation which isn't well supported.  As such, the goal is not
 * taking too long when a different cgroup takes over an inode while
 * avoiding too aggressive flip-flops from occasional foreign writes.
 *
 * We record, very roughly, 2s worth of IO time history and if more than
 * half of that is foreign, trigger the switch.  The recording is quantized
 * to 16 slots.  To avoid tiny writes from swinging the decision too much,
 * writes smaller than 1/8 of avg size are ignored.
 */
#define WB_FRN_TIME_SHIFT	13	/* 1s = 2^13, up to 8 secs w/ 16bit */
#define WB_FRN_TIME_AVG_SHIFT	3	/* avg = avg * 7/8 + new * 1/8 */
#define WB_FRN_TIME_CUT_DIV	8	/* ignore rounds < avg / 8 */
#define WB_FRN_TIME_PERIOD	(2 * (1 << WB_FRN_TIME_SHIFT))	/* 2s */

#define WB_FRN_HIST_SLOTS	16	/* inode->i_wb_frn_history is 16bit */
#define WB_FRN_HIST_UNIT	(WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS)
					/* each slot's duration is 2s / 16 */
#define WB_FRN_HIST_THR_SLOTS	(WB_FRN_HIST_SLOTS / 2)
					/* if foreign slots >= 8, switch */
#define WB_FRN_HIST_MAX_SLOTS	(WB_FRN_HIST_THR_SLOTS / 2 + 1)
					/* one round can affect up to 5 slots */
#define WB_FRN_MAX_IN_FLIGHT	1024	/* don't queue too many concurrently */
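
/*
 * Plugging the numbers in (illustrative): one time unit is 1s / 8192,
 * so WB_FRN_TIME_PERIOD is 2 * 8192 == 16384 units (~2s), each of the
 * 16 history slots covers 16384 / 16 == 1024 units (~125ms), the switch
 * threshold is 8 foreign slots (~1s worth of foreign IO), and a single
 * round can shift in at most 8 / 2 + 1 == 5 slots.
 */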

/*
 * Maximum inodes per isw.  A specific value has been chosen to make
 * struct inode_switch_wbs_context fit into a 1024 byte kmalloc.
 */
#define WB_MAX_INODES_PER_ISW  ((1024UL - sizeof(struct inode_switch_wbs_context)) \
                                / sizeof(struct inode *))
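
/*
 * Back-of-the-envelope (assuming a 64-bit build with 8-byte pointers and
 * a context header on the order of 64 bytes): roughly (1024 - ~64) / 8,
 * i.e. around 120 inode pointers per batch.  The exact count follows
 * from sizeof() above and varies with config options.
 */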

static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
static struct workqueue_struct *isw_wq;

void __inode_attach_wb(struct inode *inode, struct folio *folio)
{
	struct backing_dev_info *bdi = inode_to_bdi(inode);
	struct bdi_writeback *wb = NULL;

	if (inode_cgwb_enabled(inode)) {
		struct cgroup_subsys_state *memcg_css;

		if (folio) {
			memcg_css = mem_cgroup_css_from_folio(folio);
			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
		} else {
			/* must pin memcg_css, see wb_get_create() */
			memcg_css = task_get_css(current, memory_cgrp_id);
			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
			css_put(memcg_css);
		}
	}

	if (!wb)
		wb = &bdi->wb;

	/*
	 * There may be multiple instances of this function racing to
	 * update the same inode.  Use cmpxchg() to tell the winner.
	 */
	if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
		wb_put(wb);
}
EXPORT_SYMBOL_GPL(__inode_attach_wb);

/**
 * inode_cgwb_move_to_attached - put the inode onto wb->b_attached list
 * @inode: inode of interest with i_lock held
 * @wb: target bdi_writeback
 *
 * Remove the inode from wb's io lists and, if necessary, put it onto the
 * b_attached list.  Only inodes attached to cgwb's are kept on this list.
 */
static void inode_cgwb_move_to_attached(struct inode *inode,
					struct bdi_writeback *wb)
{
	assert_spin_locked(&wb->list_lock);
	assert_spin_locked(&inode->i_lock);
	WARN_ON_ONCE(inode->i_state & I_FREEING);

	inode->i_state &= ~I_SYNC_QUEUED;
	if (wb != &wb->bdi->wb)
		list_move(&inode->i_io_list, &wb->b_attached);
	else
		list_del_init(&inode->i_io_list);
	wb_io_lists_depopulated(wb);
}

/**
 * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it
 * @inode: inode of interest with i_lock held
 *
 * Returns @inode's wb with its list_lock held.  @inode->i_lock must be
 * held on entry and is released on return.  The returned wb is guaranteed
 * to stay @inode's associated wb until its list_lock is released.
 */
static struct bdi_writeback *
locked_inode_to_wb_and_lock_list(struct inode *inode)
	__releases(&inode->i_lock)
	__acquires(&wb->list_lock)
{
	while (true) {
		struct bdi_writeback *wb = inode_to_wb(inode);

		/*
		 * inode_to_wb() association is protected by both
		 * @inode->i_lock and @wb->list_lock but list_lock nests
		 * outside i_lock.  Drop i_lock and verify that the
		 * association hasn't changed after acquiring list_lock.
		 */
		wb_get(wb);
		spin_unlock(&inode->i_lock);
		spin_lock(&wb->list_lock);

		/* i_wb may have changed in between, can't use inode_to_wb() */
		if (likely(wb == inode->i_wb)) {
			wb_put(wb);	/* @inode already has ref */
			return wb;
		}

		spin_unlock(&wb->list_lock);
		wb_put(wb);
		cpu_relax();
		spin_lock(&inode->i_lock);
	}
}

/**
 * inode_to_wb_and_lock_list - determine an inode's wb and lock it
 * @inode: inode of interest
 *
 * Same as locked_inode_to_wb_and_lock_list() but @inode->i_lock isn't held
 * on entry.
 */
static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
	__acquires(&wb->list_lock)
{
	spin_lock(&inode->i_lock);
	return locked_inode_to_wb_and_lock_list(inode);
}

struct inode_switch_wbs_context {
	struct rcu_work		work;

	/*
	 * Multiple inodes can be switched at once.  The switching procedure
	 * consists of two parts, separated by an RCU grace period.  To make
	 * sure that the second part is executed for each inode gone through
	 * the first part, all inode pointers are placed into a NULL-terminated
	 * array embedded into struct inode_switch_wbs_context.  Otherwise
	 * an inode could be left in an inconsistent state.
	 */
	struct bdi_writeback	*new_wb;
	struct inode		*inodes[];
};

static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi)
{
	down_write(&bdi->wb_switch_rwsem);
}

static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi)
{
	up_write(&bdi->wb_switch_rwsem);
}

static bool inode_do_switch_wbs(struct inode *inode,
				struct bdi_writeback *old_wb,
				struct bdi_writeback *new_wb)
{
	struct address_space *mapping = inode->i_mapping;
	XA_STATE(xas, &mapping->i_pages, 0);
	struct folio *folio;
	bool switched = false;

	spin_lock(&inode->i_lock);
	xa_lock_irq(&mapping->i_pages);

	/*
	 * Once I_FREEING or I_WILL_FREE are visible under i_lock, the eviction
	 * path owns the inode and we shouldn't modify ->i_io_list.
	 */
	if (unlikely(inode->i_state & (I_FREEING | I_WILL_FREE)))
		goto skip_switch;

	trace_inode_switch_wbs(inode, old_wb, new_wb);

	/*
	 * Count and transfer stats.  Note that PAGECACHE_TAG_DIRTY points
	 * to possibly dirty folios while PAGECACHE_TAG_WRITEBACK points to
	 * folios actually under writeback.
	 */
	xas_for_each_marked(&xas, folio, ULONG_MAX, PAGECACHE_TAG_DIRTY) {
		if (folio_test_dirty(folio)) {
			long nr = folio_nr_pages(folio);
			wb_stat_mod(old_wb, WB_RECLAIMABLE, -nr);
			wb_stat_mod(new_wb, WB_RECLAIMABLE, nr);
		}
	}

	xas_set(&xas, 0);
	xas_for_each_marked(&xas, folio, ULONG_MAX, PAGECACHE_TAG_WRITEBACK) {
		long nr = folio_nr_pages(folio);
		WARN_ON_ONCE(!folio_test_writeback(folio));
		wb_stat_mod(old_wb, WB_WRITEBACK, -nr);
		wb_stat_mod(new_wb, WB_WRITEBACK, nr);
	}

	if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
		atomic_dec(&old_wb->writeback_inodes);
		atomic_inc(&new_wb->writeback_inodes);
	}

	wb_get(new_wb);

	/*
	 * Transfer to @new_wb's IO list if necessary.  If the @inode is dirty,
	 * the specific list @inode was on is ignored and the @inode is put on
	 * ->b_dirty which is always correct including from ->b_dirty_time.
	 * The transfer preserves @inode->dirtied_when ordering.  If the @inode
	 * was clean, it means it was on the b_attached list, so move it onto
	 * the b_attached list of @new_wb.
	 */
	if (!list_empty(&inode->i_io_list)) {
		inode->i_wb = new_wb;

		if (inode->i_state & I_DIRTY_ALL) {
			struct inode *pos;

			list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
				if (time_after_eq(inode->dirtied_when,
						  pos->dirtied_when))
					break;
			inode_io_list_move_locked(inode, new_wb,
						  pos->i_io_list.prev);
		} else {
			inode_cgwb_move_to_attached(inode, new_wb);
		}
	} else {
		inode->i_wb = new_wb;
	}

	/* ->i_wb_frn updates may race with wbc_detach_inode() but that doesn't matter */
	inode->i_wb_frn_winner = 0;
	inode->i_wb_frn_avg_time = 0;
	inode->i_wb_frn_history = 0;
	switched = true;
skip_switch:
	/*
	 * Paired with load_acquire in unlocked_inode_to_wb_begin() and
	 * ensures that the new wb is visible if they see !I_WB_SWITCH.
	 */
	smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH);

	xa_unlock_irq(&mapping->i_pages);
	spin_unlock(&inode->i_lock);

	return switched;
}

static void inode_switch_wbs_work_fn(struct work_struct *work)
{
	struct inode_switch_wbs_context *isw =
		container_of(to_rcu_work(work), struct inode_switch_wbs_context, work);
	struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]);
	struct bdi_writeback *old_wb = isw->inodes[0]->i_wb;
	struct bdi_writeback *new_wb = isw->new_wb;
	unsigned long nr_switched = 0;
	struct inode **inodep;

	/*
	 * If @inode switches cgwb membership while sync_inodes_sb() is
	 * being issued, sync_inodes_sb() might miss it.  Synchronize.
	 */
	down_read(&bdi->wb_switch_rwsem);

	/*
	 * By the time control reaches here, RCU grace period has passed
	 * since I_WB_SWITCH assertion and all wb stat update transactions
	 * between unlocked_inode_to_wb_begin/end() are guaranteed to be
	 * synchronizing against the i_pages lock.
	 *
	 * Grabbing old_wb->list_lock, inode->i_lock and the i_pages lock
	 * gives us exclusion against all wb related operations on @inode
	 * including IO list manipulations and stat updates.
	 */
	if (old_wb < new_wb) {
		spin_lock(&old_wb->list_lock);
		spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
	} else {
		spin_lock(&new_wb->list_lock);
		spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
	}

	for (inodep = isw->inodes; *inodep; inodep++) {
		WARN_ON_ONCE((*inodep)->i_wb != old_wb);
		if (inode_do_switch_wbs(*inodep, old_wb, new_wb))
			nr_switched++;
	}

	spin_unlock(&new_wb->list_lock);
	spin_unlock(&old_wb->list_lock);

	up_read(&bdi->wb_switch_rwsem);

	if (nr_switched) {
		wb_wakeup(new_wb);
		wb_put_many(old_wb, nr_switched);
	}

	for (inodep = isw->inodes; *inodep; inodep++)
		iput(*inodep);
	wb_put(new_wb);
	kfree(isw);
	atomic_dec(&isw_nr_in_flight);
}

static bool inode_prepare_wbs_switch(struct inode *inode,
				     struct bdi_writeback *new_wb)
{
	/*
	 * Paired with smp_mb() in cgroup_writeback_umount().
	 * isw_nr_in_flight must be increased before checking SB_ACTIVE and
	 * grabbing an inode, otherwise isw_nr_in_flight can be observed as 0
	 * in cgroup_writeback_umount() and the isw_wq will not be flushed.
	 */
	smp_mb();

	if (IS_DAX(inode))
		return false;

	/* while holding I_WB_SWITCH, no one else can update the association */
	spin_lock(&inode->i_lock);
	if (!(inode->i_sb->s_flags & SB_ACTIVE) ||
	    inode->i_state & (I_WB_SWITCH | I_FREEING | I_WILL_FREE) ||
	    inode_to_wb(inode) == new_wb) {
		spin_unlock(&inode->i_lock);
		return false;
	}
	inode->i_state |= I_WB_SWITCH;
	__iget(inode);
	spin_unlock(&inode->i_lock);

	return true;
}

/**
 * inode_switch_wbs - change the wb association of an inode
 * @inode: target inode
 * @new_wb_id: ID of the new wb
 *
 * Switch @inode's wb association to the wb identified by @new_wb_id.  The
 * switching is performed asynchronously and may fail silently.
 */
static void inode_switch_wbs(struct inode *inode, int new_wb_id)
{
	struct backing_dev_info *bdi = inode_to_bdi(inode);
	struct cgroup_subsys_state *memcg_css;
	struct inode_switch_wbs_context *isw;

	/* noop if a switch already seems to be in progress */
	if (inode->i_state & I_WB_SWITCH)
		return;

	/* avoid queueing a new switch if too many are already in flight */
	if (atomic_read(&isw_nr_in_flight) > WB_FRN_MAX_IN_FLIGHT)
		return;

	isw = kzalloc(struct_size(isw, inodes, 2), GFP_ATOMIC);
	if (!isw)
		return;

	atomic_inc(&isw_nr_in_flight);

	/* find and pin the new wb */
	rcu_read_lock();
	memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys);
	if (memcg_css && !css_tryget(memcg_css))
		memcg_css = NULL;
	rcu_read_unlock();
	if (!memcg_css)
		goto out_free;

	isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
	css_put(memcg_css);
	if (!isw->new_wb)
		goto out_free;

	if (!inode_prepare_wbs_switch(inode, isw->new_wb))
		goto out_free;

	isw->inodes[0] = inode;

	/*
	 * In addition to synchronizing among switchers, I_WB_SWITCH tells
	 * the RCU protected stat update paths to grab the i_pages
	 * lock so that stat transfer can synchronize against them.
	 * Let's continue after I_WB_SWITCH is guaranteed to be visible.
	 */
	INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
	queue_rcu_work(isw_wq, &isw->work);
	return;

out_free:
	atomic_dec(&isw_nr_in_flight);
	if (isw->new_wb)
		wb_put(isw->new_wb);
	kfree(isw);
}

static bool isw_prepare_wbs_switch(struct inode_switch_wbs_context *isw,
				   struct list_head *list, int *nr)
{
	struct inode *inode;

	list_for_each_entry(inode, list, i_io_list) {
		if (!inode_prepare_wbs_switch(inode, isw->new_wb))
			continue;

		isw->inodes[*nr] = inode;
		(*nr)++;

		if (*nr >= WB_MAX_INODES_PER_ISW - 1)
			return true;
	}
	return false;
}

/**
 * cleanup_offline_cgwb - detach associated inodes
 * @wb: target wb
 *
 * Switch all inodes attached to @wb to the nearest living ancestor's wb in order
 * to eventually release the dying @wb.  Returns %true if not all inodes were
 * switched and the function has to be restarted.
 */
bool cleanup_offline_cgwb(struct bdi_writeback *wb)
{
	struct cgroup_subsys_state *memcg_css;
	struct inode_switch_wbs_context *isw;
	int nr;
	bool restart = false;

	isw = kzalloc(struct_size(isw, inodes, WB_MAX_INODES_PER_ISW),
		      GFP_KERNEL);
	if (!isw)
		return restart;

	atomic_inc(&isw_nr_in_flight);

	for (memcg_css = wb->memcg_css->parent; memcg_css;
	     memcg_css = memcg_css->parent) {
		isw->new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL);
		if (isw->new_wb)
			break;
	}
	if (unlikely(!isw->new_wb))
		isw->new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */

	nr = 0;
	spin_lock(&wb->list_lock);
	/*
	 * In addition to the inodes that have completed writeback, also switch
	 * cgwbs for those inodes that only have dirty timestamps. Otherwise,
	 * those inodes won't be written back for a long time when lazytime is
	 * enabled, thus pinning the dying cgwbs. It won't break the
	 * bandwidth restrictions, as writeback of inode metadata is not
	 * accounted for.
	 */
	restart = isw_prepare_wbs_switch(isw, &wb->b_attached, &nr);
	if (!restart)
		restart = isw_prepare_wbs_switch(isw, &wb->b_dirty_time, &nr);
	spin_unlock(&wb->list_lock);

	/* no attached inodes? bail out */
	if (nr == 0) {
		atomic_dec(&isw_nr_in_flight);
		wb_put(isw->new_wb);
		kfree(isw);
		return restart;
	}

	/*
	 * In addition to synchronizing among switchers, I_WB_SWITCH tells
	 * the RCU protected stat update paths to grab the i_pages
	 * lock so that stat transfer can synchronize against them.
	 * Let's continue after I_WB_SWITCH is guaranteed to be visible.
	 */
	INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
	queue_rcu_work(isw_wq, &isw->work);

	return restart;
}

/**
 * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it
 * @wbc: writeback_control of interest
 * @inode: target inode
 *
 * @inode is locked and about to be written back under the control of @wbc.
 * Record @inode's writeback context into @wbc and unlock the i_lock.  On
 * writeback completion, wbc_detach_inode() should be called.  This is used
 * to track the cgroup writeback context.
 */
void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
				 struct inode *inode)
{
	if (!inode_cgwb_enabled(inode)) {
		spin_unlock(&inode->i_lock);
		return;
	}

	wbc->wb = inode_to_wb(inode);
	wbc->inode = inode;

	wbc->wb_id = wbc->wb->memcg_css->id;
	wbc->wb_lcand_id = inode->i_wb_frn_winner;
	wbc->wb_tcand_id = 0;
	wbc->wb_bytes = 0;
	wbc->wb_lcand_bytes = 0;
	wbc->wb_tcand_bytes = 0;

	wb_get(wbc->wb);
	spin_unlock(&inode->i_lock);

	/*
	 * A dying wb indicates that either the blkcg associated with the
	 * memcg changed or the associated memcg is dying.  In the first
	 * case, a replacement wb should already be available and we should
	 * refresh the wb immediately.  In the second case, trying to
	 * refresh will keep failing.
	 */
	if (unlikely(wb_dying(wbc->wb) && !css_is_dying(wbc->wb->memcg_css)))
		inode_switch_wbs(inode, wbc->wb_id);
}
EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode);

/**
 * wbc_detach_inode - disassociate wbc from inode and perform foreign detection
 * @wbc: writeback_control of the just finished writeback
 *
 * To be called after a writeback attempt of an inode finishes and undoes
 * wbc_attach_and_unlock_inode().  Can be called under any context.
 *
 * As concurrent write sharing of an inode is expected to be very rare and
 * memcg only tracks page ownership on a first-use basis, severely confining
 * the usefulness of such sharing, cgroup writeback tracks ownership
 * per-inode.  While the support for concurrent write sharing of an inode
 * is deemed unnecessary, an inode being written to by different cgroups at
 * different points in time is a lot more common, and, more importantly,
 * charging only by first-use can too readily lead to grossly incorrect
 * behaviors (a single foreign page can lead to gigabytes of writeback being
 * incorrectly attributed).
 *
 * To resolve this issue, cgroup writeback detects the majority dirtier of
 * an inode and transfers the ownership to it.  To avoid unnecessary
 * oscillation, the detection mechanism keeps track of history and gives
 * out the switch verdict only if the foreign usage pattern is stable over
 * a certain amount of time and/or writeback attempts.
 *
 * On each writeback attempt, @wbc tries to detect the majority writer
 * using the Boyer-Moore majority vote algorithm.  In addition to the byte
 * count from the majority voting, it also counts the bytes written for the
 * current wb and the last round's winner wb (max of last round's current
 * wb, the winner from two rounds ago, and the last round's majority
 * candidate).  Keeping track of the historical winner helps the algorithm
 * to semi-reliably detect the most active writer even when it's not the
 * absolute majority.
 *
 * Once the winner of the round is determined, whether the winner is
 * foreign or not and how much IO time the round consumed is recorded in
 * inode->i_wb_frn_history.  If the amount of recorded foreign IO time is
 * over a certain threshold, the switch verdict is given.
 */
void wbc_detach_inode(struct writeback_control *wbc)
{
	struct bdi_writeback *wb = wbc->wb;
	struct inode *inode = wbc->inode;
	unsigned long avg_time, max_bytes, max_time;
	u16 history;
	int max_id;

	if (!wb)
		return;

	history = inode->i_wb_frn_history;
	avg_time = inode->i_wb_frn_avg_time;

	/* pick the winner of this round */
	if (wbc->wb_bytes >= wbc->wb_lcand_bytes &&
	    wbc->wb_bytes >= wbc->wb_tcand_bytes) {
		max_id = wbc->wb_id;
		max_bytes = wbc->wb_bytes;
	} else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) {
		max_id = wbc->wb_lcand_id;
		max_bytes = wbc->wb_lcand_bytes;
	} else {
		max_id = wbc->wb_tcand_id;
		max_bytes = wbc->wb_tcand_bytes;
	}

	/*
	 * Calculate the amount of IO time the winner consumed and fold it
	 * into the running average kept per inode.  If the consumed IO
	 * time is lower than avg / WB_FRN_TIME_CUT_DIV, ignore it for
	 * deciding whether to switch or not.  This is to prevent one-off
	 * small dirtiers from skewing the verdict.
	 */
	max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT,
				wb->avg_write_bandwidth);
	if (avg_time)
		avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) -
			    (avg_time >> WB_FRN_TIME_AVG_SHIFT);
	else
		avg_time = max_time;	/* immediate catch up on first run */
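	/*
	 * Numeric sketch of the 7/8 running average above: with
	 * WB_FRN_TIME_AVG_SHIFT == 3, avg_time == 800 and max_time == 1600,
	 * avg_time becomes 800 + (1600 >> 3) - (800 >> 3) == 900.
	 */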

	if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) {
		int slots;

		/*
		 * The switch verdict is reached if foreign wb's consume
		 * more than a certain proportion of IO time in a
		 * WB_FRN_TIME_PERIOD.  This is loosely tracked by a 16 slot
		 * history mask where each bit represents one sixteenth of
		 * the period.  Determine the number of slots to shift into
		 * history from @max_time.
		 */
		slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT),
			    (unsigned long)WB_FRN_HIST_MAX_SLOTS);
		history <<= slots;
		if (wbc->wb_id != max_id)
			history |= (1U << slots) - 1;
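		/*
		 * Illustration: with slots == 3 and a foreign winner, the
		 * three newest history bits are set, i.e.
		 * history = (history << 3) | 0x7; a native (current wb)
		 * winner shifts in three zero bits instead.
		 */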

		if (history)
			trace_inode_foreign_history(inode, wbc, history);

		/*
		 * Switch if the current wb isn't the consistent winner.
		 * If there are multiple closely competing dirtiers, the
		 * inode may switch across them repeatedly over time, which
		 * is okay.  The main goal is avoiding keeping an inode on
		 * the wrong wb for an extended period of time.
		 */
		if (hweight16(history) > WB_FRN_HIST_THR_SLOTS)
			inode_switch_wbs(inode, max_id);
	}

	/*
	 * Multiple instances of this function may race to update the
	 * following fields but we don't mind occasional inaccuracies.
	 */
	inode->i_wb_frn_winner = max_id;
	inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX);
	inode->i_wb_frn_history = history;

	wb_put(wbc->wb);
	wbc->wb = NULL;
}
EXPORT_SYMBOL_GPL(wbc_detach_inode);

/**
 * wbc_account_cgroup_owner - account writeback to update inode cgroup ownership
 * @wbc: writeback_control of the writeback in progress
 * @page: page being written out
 * @bytes: number of bytes being written out
 *
 * @bytes from @page are about to be written out during the writeback
 * controlled by @wbc.  Keep the book for foreign inode detection.  See
 * wbc_detach_inode().
 */
void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
			      size_t bytes)
{
	struct folio *folio;
	struct cgroup_subsys_state *css;
	int id;

	/*
	 * pageout() path doesn't attach @wbc to the inode being written
	 * out.  This is intentional as we don't want the function to block
	 * behind a slow cgroup.  Ultimately, we want pageout() to kick off
	 * regular writeback instead of writing things out itself.
	 */
	if (!wbc->wb || wbc->no_cgroup_owner)
		return;

	folio = page_folio(page);
	css = mem_cgroup_css_from_folio(folio);
	/* dead cgroups shouldn't contribute to inode ownership arbitration */
	if (!(css->flags & CSS_ONLINE))
		return;

	id = css->id;

	if (id == wbc->wb_id) {
		wbc->wb_bytes += bytes;
		return;
	}

	if (id == wbc->wb_lcand_id)
		wbc->wb_lcand_bytes += bytes;

	/* Boyer-Moore majority vote algorithm */
	if (!wbc->wb_tcand_bytes)
		wbc->wb_tcand_id = id;
	if (id == wbc->wb_tcand_id)
		wbc->wb_tcand_bytes += bytes;
	else
		wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
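
	/*
	 * Illustration of the voting above: for writes A:4k, B:4k, A:8k
	 * from cgroups other than the current wb, A first becomes the
	 * candidate (4k), B then cancels it back down to 0, and A is
	 * re-elected with 8k.  The heaviest recent writer thus tends to
	 * end up as wb_tcand_id without tracking per-cgroup state.
	 */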
}
EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner);

/**
 * wb_split_bdi_pages - split nr_pages to write according to bandwidth
 * @wb: target bdi_writeback to split @nr_pages to
 * @nr_pages: number of pages to write for the whole bdi
 *
 * Split @wb's portion of @nr_pages according to @wb's write bandwidth in
 * relation to the total write bandwidth of all wb's w/ dirty inodes on
 * @wb->bdi.
 */
static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
{
	unsigned long this_bw = wb->avg_write_bandwidth;
	unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);

	if (nr_pages == LONG_MAX)
		return LONG_MAX;

	/*
	 * This may be called on clean wb's, where proportional distribution
	 * may not make sense; just use the original @nr_pages in those
	 * cases.  In general, we want to err on the side of writing more.
	 */
	if (!tot_bw || this_bw >= tot_bw)
		return nr_pages;
	else
		return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw);
}
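
/*
 * Example split (illustrative): a wb with avg_write_bandwidth of 25MB/s
 * on a bdi totalling 100MB/s gets DIV_ROUND_UP_ULL(1024 * 25, 100) == 256
 * pages of a 1024 page request, i.e. a quarter, matching its share of
 * the total bandwidth.
 */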

/**
 * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi
 * @bdi: target backing_dev_info
 * @base_work: wb_writeback_work to issue
 * @skip_if_busy: skip wb's which already have writeback in progress
 *
 * Split and issue @base_work to all wb's (bdi_writeback's) of @bdi which
 * have dirty inodes.  If @base_work->nr_pages isn't %LONG_MAX, it's
 * distributed to the busy wbs according to each wb's proportion in the
 * total active write bandwidth of @bdi.
 */
static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
				  struct wb_writeback_work *base_work,
				  bool skip_if_busy)
{
	struct bdi_writeback *last_wb = NULL;
	struct bdi_writeback *wb = list_entry(&bdi->wb_list,
					      struct bdi_writeback, bdi_node);

	might_sleep();
restart:
	rcu_read_lock();
	list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
		DEFINE_WB_COMPLETION(fallback_work_done, bdi);
		struct wb_writeback_work fallback_work;
		struct wb_writeback_work *work;
		long nr_pages;

		if (last_wb) {
			wb_put(last_wb);
			last_wb = NULL;
		}

		/* SYNC_ALL writes out I_DIRTY_TIME too */
		if (!wb_has_dirty_io(wb) &&
		    (base_work->sync_mode == WB_SYNC_NONE ||
		     list_empty(&wb->b_dirty_time)))
			continue;
		if (skip_if_busy && writeback_in_progress(wb))
			continue;

		nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages);

		work = kmalloc(sizeof(*work), GFP_ATOMIC);
		if (work) {
			*work = *base_work;
			work->nr_pages = nr_pages;
			work->auto_free = 1;
			wb_queue_work(wb, work);
			continue;
		}

		/*
		 * If wb_tryget fails, the wb has been shut down; skip it.
		 *
		 * Pin @wb so that it stays on @bdi->wb_list.  This allows
		 * continuing iteration from @wb after dropping and
		 * regrabbing rcu read lock.
		 */
		if (!wb_tryget(wb))
			continue;

		/* alloc failed, execute synchronously using on-stack fallback */
		work = &fallback_work;
		*work = *base_work;
		work->nr_pages = nr_pages;
		work->auto_free = 0;
		work->done = &fallback_work_done;

		wb_queue_work(wb, work);
		last_wb = wb;

		rcu_read_unlock();
		wb_wait_for_completion(&fallback_work_done);
		goto restart;
	}
	rcu_read_unlock();

	if (last_wb)
		wb_put(last_wb);
}

/**
 * cgroup_writeback_by_id - initiate cgroup writeback from bdi and memcg IDs
 * @bdi_id: target bdi id
 * @memcg_id: target memcg css id
103262306a36Sopenharmony_ci * @reason: reason why some writeback work initiated
103362306a36Sopenharmony_ci * @done: target wb_completion
103462306a36Sopenharmony_ci *
103562306a36Sopenharmony_ci * Initiate flush of the bdi_writeback identified by @bdi_id and @memcg_id
103662306a36Sopenharmony_ci * with the specified parameters.
103762306a36Sopenharmony_ci */
103862306a36Sopenharmony_ciint cgroup_writeback_by_id(u64 bdi_id, int memcg_id,
103962306a36Sopenharmony_ci			   enum wb_reason reason, struct wb_completion *done)
104062306a36Sopenharmony_ci{
104162306a36Sopenharmony_ci	struct backing_dev_info *bdi;
104262306a36Sopenharmony_ci	struct cgroup_subsys_state *memcg_css;
104362306a36Sopenharmony_ci	struct bdi_writeback *wb;
104462306a36Sopenharmony_ci	struct wb_writeback_work *work;
104562306a36Sopenharmony_ci	unsigned long dirty;
104662306a36Sopenharmony_ci	int ret;
104762306a36Sopenharmony_ci
104862306a36Sopenharmony_ci	/* lookup bdi and memcg */
104962306a36Sopenharmony_ci	bdi = bdi_get_by_id(bdi_id);
105062306a36Sopenharmony_ci	if (!bdi)
105162306a36Sopenharmony_ci		return -ENOENT;
105262306a36Sopenharmony_ci
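	/*
	 * css_from_id() is only valid under RCU and the css may already be
	 * dying; css_tryget() confirms it is still alive so the reference
	 * can be used after the RCU read section ends.
	 */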
	rcu_read_lock();
	memcg_css = css_from_id(memcg_id, &memory_cgrp_subsys);
	if (memcg_css && !css_tryget(memcg_css))
		memcg_css = NULL;
	rcu_read_unlock();
	if (!memcg_css) {
		ret = -ENOENT;
		goto out_bdi_put;
	}

	/*
	 * And find the associated wb.  If the wb isn't there already,
	 * there's nothing to flush; don't create one.
	 */
	wb = wb_get_lookup(bdi, memcg_css);
	if (!wb) {
		ret = -ENOENT;
		goto out_css_put;
	}

	/*
	 * The caller is attempting to write out most of
	 * the currently dirty pages.  Let's take the current dirty page
	 * count and inflate it by 25%, which should be large enough to
	 * flush out most dirty pages while avoiding getting livelocked by
	 * concurrent dirtiers.
	 *
	 * The memcg stats are flushed only periodically and this is a
	 * best-effort estimate, so some error is acceptable.
	 */
	dirty = memcg_page_state(mem_cgroup_from_css(memcg_css), NR_FILE_DIRTY);
	dirty = dirty * 10 / 8;
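	/*
	 * e.g. 80000 dirty pages yields a target of 80000 * 10 / 8 =
	 * 100000 pages, i.e. the 25% headroom mentioned above.
	 */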

	/* issue the writeback work */
	work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOWARN);
	if (work) {
		work->nr_pages = dirty;
		work->sync_mode = WB_SYNC_NONE;
		work->range_cyclic = 1;
		work->reason = reason;
		work->done = done;
		work->auto_free = 1;
		wb_queue_work(wb, work);
		ret = 0;
	} else {
		ret = -ENOMEM;
	}

	wb_put(wb);
out_css_put:
	css_put(memcg_css);
out_bdi_put:
	bdi_put(bdi);
	return ret;
}

/**
 * cgroup_writeback_umount - flush inode wb switches for umount
 *
 * This function is called when a super_block is about to be destroyed and
 * flushes in-flight inode wb switches.  An inode wb switch goes through
 * RCU and then workqueue, so the two need to be flushed in order to ensure
 * that all previously scheduled switches are finished.  As wb switches are
 * rare occurrences and rcu_barrier() can take a while, perform the
 * flushing iff wb switches are in flight.
 */
void cgroup_writeback_umount(void)
{
	/*
	 * SB_ACTIVE should be reliably cleared before checking
	 * isw_nr_in_flight, see generic_shutdown_super().
	 */
	smp_mb();

	if (atomic_read(&isw_nr_in_flight)) {
		/*
		 * Use rcu_barrier() to wait for all pending callbacks to
		 * ensure that all in-flight wb switches are in the workqueue.
		 */
		rcu_barrier();
		flush_workqueue(isw_wq);
	}
}

static int __init cgroup_writeback_init(void)
{
	isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0);
	if (!isw_wq)
		return -ENOMEM;
	return 0;
}
fs_initcall(cgroup_writeback_init);

#else	/* CONFIG_CGROUP_WRITEBACK */

static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }

static void inode_cgwb_move_to_attached(struct inode *inode,
					struct bdi_writeback *wb)
{
	assert_spin_locked(&wb->list_lock);
	assert_spin_locked(&inode->i_lock);
	WARN_ON_ONCE(inode->i_state & I_FREEING);

	inode->i_state &= ~I_SYNC_QUEUED;
	list_del_init(&inode->i_io_list);
	wb_io_lists_depopulated(wb);
}

static struct bdi_writeback *
locked_inode_to_wb_and_lock_list(struct inode *inode)
	__releases(&inode->i_lock)
	__acquires(&wb->list_lock)
{
	struct bdi_writeback *wb = inode_to_wb(inode);

	spin_unlock(&inode->i_lock);
	spin_lock(&wb->list_lock);
	return wb;
}

static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
	__acquires(&wb->list_lock)
{
	struct bdi_writeback *wb = inode_to_wb(inode);

	spin_lock(&wb->list_lock);
	return wb;
}

static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
{
	return nr_pages;
}

static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
				  struct wb_writeback_work *base_work,
				  bool skip_if_busy)
{
	might_sleep();

	if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) {
		base_work->auto_free = 0;
		wb_queue_work(&bdi->wb, base_work);
	}
}

#endif	/* CONFIG_CGROUP_WRITEBACK */

/*
 * Add in the number of potentially dirty inodes, because each inode
 * write can dirty pagecache in the underlying blockdev.
 */
static unsigned long get_nr_dirty_pages(void)
{
	return global_node_page_state(NR_FILE_DIRTY) +
		get_nr_dirty_inodes();
}

static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason)
{
	if (!wb_has_dirty_io(wb))
		return;

	/*
	 * All callers of this function want to start writeback of all
	 * dirty pages. Places like vmscan can call this at a very
	 * high frequency, causing pointless allocations of tons of
	 * work items and keeping the flusher threads busy retrieving
	 * that work. Ensure that we only allow one of them pending and
	 * in flight at a time.
	 */
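	/*
	 * The plain test_bit() is a cheap unlocked check that lets the
	 * common already-set case bail out without the atomic RMW of
	 * test_and_set_bit().
	 */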
	if (test_bit(WB_start_all, &wb->state) ||
	    test_and_set_bit(WB_start_all, &wb->state))
		return;

	wb->start_all_reason = reason;
	wb_wakeup(wb);
}

/**
 * wb_start_background_writeback - start background writeback
 * @wb: bdi_writeback to write from
 *
 * Description:
 *   This makes sure WB_SYNC_NONE background writeback happens. When
 *   this function returns, it is only guaranteed that for given wb
 *   some IO is happening if we are over the background dirty threshold.
 *   Caller need not hold sb s_umount semaphore.
 */
void wb_start_background_writeback(struct bdi_writeback *wb)
{
	/*
	 * We just wake up the flusher thread. It will perform background
	 * writeback as soon as there is no other work to do.
	 */
	trace_writeback_wake_background(wb);
	wb_wakeup(wb);
}

/*
 * Remove the inode from the writeback list it is on.
 */
void inode_io_list_del(struct inode *inode)
{
	struct bdi_writeback *wb;

	wb = inode_to_wb_and_lock_list(inode);
	spin_lock(&inode->i_lock);

	inode->i_state &= ~I_SYNC_QUEUED;
	list_del_init(&inode->i_io_list);
	wb_io_lists_depopulated(wb);

	spin_unlock(&inode->i_lock);
	spin_unlock(&wb->list_lock);
}
EXPORT_SYMBOL(inode_io_list_del);

/*
 * mark an inode as under writeback on the sb
 */
void sb_mark_inode_writeback(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	unsigned long flags;

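	/*
	 * Double-checked pattern: the unlocked list_empty() test skips the
	 * lock for inodes already on the list; the check is repeated under
	 * s_inode_wblist_lock to close the race with a concurrent add.
	 */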
	if (list_empty(&inode->i_wb_list)) {
		spin_lock_irqsave(&sb->s_inode_wblist_lock, flags);
		if (list_empty(&inode->i_wb_list)) {
			list_add_tail(&inode->i_wb_list, &sb->s_inodes_wb);
			trace_sb_mark_inode_writeback(inode);
		}
		spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags);
	}
}

/*
 * clear an inode as under writeback on the sb
 */
void sb_clear_inode_writeback(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	unsigned long flags;

	if (!list_empty(&inode->i_wb_list)) {
		spin_lock_irqsave(&sb->s_inode_wblist_lock, flags);
		if (!list_empty(&inode->i_wb_list)) {
			list_del_init(&inode->i_wb_list);
			trace_sb_clear_inode_writeback(inode);
		}
		spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags);
	}
}

/*
 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
 * furthest end of its superblock's dirty-inode list.
 *
 * Before stamping the inode's ->dirtied_when, we check to see whether it is
 * already the most-recently-dirtied inode on the b_dirty list.  If that is
 * the case then the inode must have been redirtied while it was being written
 * out and we don't reset its dirtied_when.
 */
static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb)
{
	assert_spin_locked(&inode->i_lock);

	inode->i_state &= ~I_SYNC_QUEUED;
	/*
	 * When the inode is being freed, just don't bother with dirty list
	 * tracking. The flush worker will ignore this inode anyway and it
	 * would trigger assertions in inode_io_list_move_locked().
	 */
	if (inode->i_state & I_FREEING) {
		list_del_init(&inode->i_io_list);
		wb_io_lists_depopulated(wb);
		return;
	}
	if (!list_empty(&wb->b_dirty)) {
		struct inode *tail;

		tail = wb_inode(wb->b_dirty.next);
		if (time_before(inode->dirtied_when, tail->dirtied_when))
			inode->dirtied_when = jiffies;
	}
	inode_io_list_move_locked(inode, wb, &wb->b_dirty);
}

static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
{
	spin_lock(&inode->i_lock);
	redirty_tail_locked(inode, wb);
	spin_unlock(&inode->i_lock);
}

/*
 * requeue inode for re-scanning after the bdi->b_io list is exhausted.
 */
static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
{
	inode_io_list_move_locked(inode, wb, &wb->b_more_io);
}

static void inode_sync_complete(struct inode *inode)
{
	inode->i_state &= ~I_SYNC;
	/* If inode is clean and unused, put it into LRU now... */
	inode_add_lru(inode);
	/* Waiters must see I_SYNC cleared before being woken up */
	smp_mb();
	wake_up_bit(&inode->i_state, __I_SYNC);
}

static bool inode_dirtied_after(struct inode *inode, unsigned long t)
{
	bool ret = time_after(inode->dirtied_when, t);
#ifndef CONFIG_64BIT
	/*
	 * For inodes being constantly redirtied, dirtied_when can get stuck.
	 * It _appears_ to be in the future, but is actually in the distant
	 * past.  This test is necessary to prevent such wrapped-around
	 * relative times from permanently stopping the whole bdi writeback.
	 */
	ret = ret && time_before_eq(inode->dirtied_when, jiffies);
#endif
	return ret;
}

/*
 * Move expired (dirtied before dirtied_before) dirty inodes from
 * @delaying_queue to @dispatch_queue.
 */
static int move_expired_inodes(struct list_head *delaying_queue,
			       struct list_head *dispatch_queue,
			       unsigned long dirtied_before)
{
	LIST_HEAD(tmp);
	struct list_head *pos, *node;
	struct super_block *sb = NULL;
	struct inode *inode;
	int do_sb_sort = 0;
	int moved = 0;

	while (!list_empty(delaying_queue)) {
		inode = wb_inode(delaying_queue->prev);
		if (inode_dirtied_after(inode, dirtied_before))
			break;
		spin_lock(&inode->i_lock);
		list_move(&inode->i_io_list, &tmp);
		moved++;
		inode->i_state |= I_SYNC_QUEUED;
		spin_unlock(&inode->i_lock);
		if (sb_is_blkdev_sb(inode->i_sb))
			continue;
		if (sb && sb != inode->i_sb)
			do_sb_sort = 1;
		sb = inode->i_sb;
	}

	/* just one sb in list, splice to dispatch_queue and we're done */
	if (!do_sb_sort) {
		list_splice(&tmp, dispatch_queue);
		goto out;
	}

	/*
	 * Although each inode's i_io_list is moved from 'tmp' to
	 * 'dispatch_queue', we don't take inode->i_lock here because it
	 * would be pointless overhead.  The inode is already marked
	 * I_SYNC_QUEUED, so writeback list handling is fully under our
	 * control.
	 */
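	/*
	 * Group by superblock: take the sb of the last inode on tmp, sweep
	 * tmp moving every inode of that sb to dispatch_queue, then repeat
	 * with the next remaining sb until tmp is empty.
	 */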
	while (!list_empty(&tmp)) {
		sb = wb_inode(tmp.prev)->i_sb;
		list_for_each_prev_safe(pos, node, &tmp) {
			inode = wb_inode(pos);
			if (inode->i_sb == sb)
				list_move(&inode->i_io_list, dispatch_queue);
		}
	}
out:
	return moved;
}

/*
 * Queue all expired dirty inodes for io, oldest first.
 * Before
 *         newly dirtied     b_dirty    b_io    b_more_io
 *         =============>    gf         edc     BA
 * After
 *         newly dirtied     b_dirty    b_io    b_more_io
 *         =============>    g          fBAedc
 *                                           |
 *                                           +--> dequeue for IO
 */
static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work,
		     unsigned long dirtied_before)
{
	int moved;
	unsigned long time_expire_jif = dirtied_before;

	assert_spin_locked(&wb->list_lock);
	list_splice_init(&wb->b_more_io, &wb->b_io);
	moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, dirtied_before);
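	/*
	 * sync(2) must also write lazily-dirtied timestamps, so for_sync
	 * expires b_dirty_time inodes with the same cutoff as b_dirty;
	 * otherwise only timestamps older than dirtytime_expire_interval
	 * are written.
	 */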
	if (!work->for_sync)
		time_expire_jif = jiffies - dirtytime_expire_interval * HZ;
	moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
				     time_expire_jif);
	if (moved)
		wb_io_lists_populated(wb);
	trace_writeback_queue_io(wb, work, dirtied_before, moved);
}

static int write_inode(struct inode *inode, struct writeback_control *wbc)
{
	int ret;

	if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) {
		trace_writeback_write_inode_start(inode, wbc);
		ret = inode->i_sb->s_op->write_inode(inode, wbc);
		trace_writeback_write_inode(inode, wbc);
		return ret;
	}
	return 0;
}

/*
 * Wait for writeback on an inode to complete. Called with i_lock held.
 * Caller must make sure inode cannot go away when we drop i_lock.
 */
static void __inode_wait_for_writeback(struct inode *inode)
	__releases(inode->i_lock)
	__acquires(inode->i_lock)
{
	DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
	wait_queue_head_t *wqh;

	wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
	while (inode->i_state & I_SYNC) {
		spin_unlock(&inode->i_lock);
		__wait_on_bit(wqh, &wq, bit_wait,
			      TASK_UNINTERRUPTIBLE);
		spin_lock(&inode->i_lock);
	}
}

/*
 * Wait for writeback on an inode to complete. Caller must have inode pinned.
 */
void inode_wait_for_writeback(struct inode *inode)
{
	spin_lock(&inode->i_lock);
	__inode_wait_for_writeback(inode);
	spin_unlock(&inode->i_lock);
}

/*
 * Sleep until I_SYNC is cleared. This function must be called with i_lock
 * held and drops it. It is aimed at callers not holding any inode reference,
 * so once i_lock is dropped, the inode can go away.
 */
static void inode_sleep_on_writeback(struct inode *inode)
	__releases(inode->i_lock)
{
	DEFINE_WAIT(wait);
	wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
	int sleep;

	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
	sleep = inode->i_state & I_SYNC;
	spin_unlock(&inode->i_lock);
	if (sleep)
		schedule();
	finish_wait(wqh, &wait);
}

/*
 * Find the proper writeback list for the inode depending on its current state
 * and possibly also a change of its state while we were doing writeback.  Here
 * we handle things such as livelock prevention or fairness of writeback among
 * inodes. This function can be called only by the flusher thread; no one else
 * processes all inodes in writeback lists, and requeueing inodes behind the
 * flusher thread's back can have unexpected consequences.
 */
static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
			  struct writeback_control *wbc)
{
	if (inode->i_state & I_FREEING)
		return;

	/*
	 * Sync livelock prevention. Each inode is tagged and synced in one
	 * shot. If still dirty, it will be redirty_tail()'ed below.  Update
	 * the dirty time to prevent it from being enqueued and synced again.
	 */
	if ((inode->i_state & I_DIRTY) &&
	    (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
		inode->dirtied_when = jiffies;

	if (wbc->pages_skipped) {
		/*
		 * Writeback is not making progress due to locked buffers.
		 * Skip this inode for now. Although having skipped pages
		 * is odd for clean inodes, it can happen for some
		 * filesystems so handle that gracefully.
		 */
		if (inode->i_state & I_DIRTY_ALL)
			redirty_tail_locked(inode, wb);
		else
			inode_cgwb_move_to_attached(inode, wb);
		return;
	}

	if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
		/*
		 * We didn't write back all the pages.  nfs_writepages()
		 * sometimes bails out without doing anything.
		 */
		if (wbc->nr_to_write <= 0) {
			/* Slice used up. Queue for next turn. */
			requeue_io(inode, wb);
		} else {
			/*
			 * Writeback blocked by something other than
			 * congestion. Delay the inode for some time to
			 * avoid spinning on the CPU (100% iowait)
			 * retrying writeback of the dirty page/inode
			 * that cannot be performed immediately.
			 */
			redirty_tail_locked(inode, wb);
		}
	} else if (inode->i_state & I_DIRTY) {
		/*
		 * Filesystems can dirty the inode during writeback operations,
		 * such as delayed allocation during submission or metadata
		 * updates after data IO completion.
		 */
		redirty_tail_locked(inode, wb);
	} else if (inode->i_state & I_DIRTY_TIME) {
		inode->dirtied_when = jiffies;
		inode_io_list_move_locked(inode, wb, &wb->b_dirty_time);
		inode->i_state &= ~I_SYNC_QUEUED;
	} else {
		/* The inode is clean. Remove it from writeback lists. */
		inode_cgwb_move_to_attached(inode, wb);
	}
}

/*
 * Write out an inode and its dirty pages (or some of its dirty pages, depending
 * on @wbc->nr_to_write), and clear the relevant dirty flags from i_state.
 *
 * This doesn't remove the inode from the writeback list it is on, except
 * potentially to move it from b_dirty_time to b_dirty due to timestamp
 * expiration.  The caller is otherwise responsible for writeback list handling.
 *
 * The caller is also responsible for setting the I_SYNC flag beforehand and
 * calling inode_sync_complete() to clear it afterwards.
 */
static int
__writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
{
	struct address_space *mapping = inode->i_mapping;
	long nr_to_write = wbc->nr_to_write;
	unsigned dirty;
	int ret;

	WARN_ON(!(inode->i_state & I_SYNC));

	trace_writeback_single_inode_start(inode, wbc, nr_to_write);

	ret = do_writepages(mapping, wbc);

	/*
	 * Make sure to wait on the data before writing out the metadata.
	 * This is important for filesystems that modify metadata on data
	 * I/O completion. We don't do it for sync(2) writeback because it has a
	 * separate, external IO completion path and ->sync_fs for guaranteeing
	 * inode metadata is written back correctly.
	 */
	if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) {
		int err = filemap_fdatawait(mapping);
		if (ret == 0)
			ret = err;
	}

	/*
	 * If the inode has dirty timestamps and we need to write them, call
	 * mark_inode_dirty_sync() to notify the filesystem about it and to
	 * change I_DIRTY_TIME into I_DIRTY_SYNC.
	 */
	if ((inode->i_state & I_DIRTY_TIME) &&
	    (wbc->sync_mode == WB_SYNC_ALL ||
	     time_after(jiffies, inode->dirtied_time_when +
			dirtytime_expire_interval * HZ))) {
		trace_writeback_lazytime(inode);
		mark_inode_dirty_sync(inode);
	}

	/*
	 * Get and clear the dirty flags from i_state.  This needs to be done
	 * after calling writepages because some filesystems may redirty the
	 * inode during writepages due to delalloc.  It also needs to be done
	 * after handling timestamp expiration, as that may dirty the inode too.
	 */
	spin_lock(&inode->i_lock);
	dirty = inode->i_state & I_DIRTY;
	inode->i_state &= ~dirty;

	/*
	 * Paired with smp_mb() in __mark_inode_dirty().  This allows
	 * __mark_inode_dirty() to test i_state without grabbing i_lock -
	 * either they see the I_DIRTY bits cleared or we see the dirtied
	 * inode.
	 *
	 * I_DIRTY_PAGES is always cleared together above even if @mapping
	 * still has dirty pages.  The flag is reinstated after smp_mb() if
	 * necessary.  This guarantees that either __mark_inode_dirty()
	 * sees clear I_DIRTY_PAGES or we see PAGECACHE_TAG_DIRTY.
	 */
	smp_mb();

	if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
		inode->i_state |= I_DIRTY_PAGES;
	else if (unlikely(inode->i_state & I_PINNING_FSCACHE_WB)) {
		if (!(inode->i_state & I_DIRTY_PAGES)) {
			inode->i_state &= ~I_PINNING_FSCACHE_WB;
			wbc->unpinned_fscache_wb = true;
			dirty |= I_PINNING_FSCACHE_WB; /* Cause write_inode */
		}
	}

	spin_unlock(&inode->i_lock);

	/* Don't write the inode if only I_DIRTY_PAGES was set */
	if (dirty & ~I_DIRTY_PAGES) {
		int err = write_inode(inode, wbc);
		if (ret == 0)
			ret = err;
	}
	wbc->unpinned_fscache_wb = false;
	trace_writeback_single_inode(inode, wbc, nr_to_write);
	return ret;
}

/*
 * Write out an inode's dirty data and metadata on-demand, i.e. separately from
 * the regular batched writeback done by the flusher threads in
 * writeback_sb_inodes().  @wbc controls various aspects of the write, such as
 * whether it is a data-integrity sync (%WB_SYNC_ALL) or not (%WB_SYNC_NONE).
 *
 * To prevent the inode from going away, either the caller must have a reference
 * to the inode, or the inode must have I_WILL_FREE or I_FREEING set.
 */
static int writeback_single_inode(struct inode *inode,
				  struct writeback_control *wbc)
{
	struct bdi_writeback *wb;
	int ret = 0;

	spin_lock(&inode->i_lock);
	if (!atomic_read(&inode->i_count))
		WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
	else
		WARN_ON(inode->i_state & I_WILL_FREE);

	if (inode->i_state & I_SYNC) {
		/*
		 * Writeback is already running on the inode.  For WB_SYNC_NONE,
		 * that's enough and we can just return.  For WB_SYNC_ALL, we
		 * must wait for the existing writeback to complete, then do
		 * writeback again if there's anything left.
		 */
		if (wbc->sync_mode != WB_SYNC_ALL)
			goto out;
		__inode_wait_for_writeback(inode);
	}
	WARN_ON(inode->i_state & I_SYNC);
	/*
	 * If the inode is already fully clean, then there's nothing to do.
	 *
	 * For data-integrity syncs we also need to check whether any pages are
	 * still under writeback, e.g. due to prior WB_SYNC_NONE writeback.  If
	 * there are any such pages, we'll need to wait for them.
	 */
	if (!(inode->i_state & I_DIRTY_ALL) &&
	    (wbc->sync_mode != WB_SYNC_ALL ||
	     !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
		goto out;
	inode->i_state |= I_SYNC;
	wbc_attach_and_unlock_inode(wbc, inode);

	ret = __writeback_single_inode(inode, wbc);

	wbc_detach_inode(wbc);

	wb = inode_to_wb_and_lock_list(inode);
	spin_lock(&inode->i_lock);
	/*
	 * If the inode is being freed, its i_io_list shouldn't be updated
	 * as it may be finally deleted at this moment.
	 */
	if (!(inode->i_state & I_FREEING)) {
		/*
		 * If the inode is now fully clean, then it can be safely
		 * removed from its writeback list (if any). Otherwise the
		 * flusher threads are responsible for the writeback lists.
		 */
		if (!(inode->i_state & I_DIRTY_ALL))
			inode_cgwb_move_to_attached(inode, wb);
		else if (!(inode->i_state & I_SYNC_QUEUED)) {
			if ((inode->i_state & I_DIRTY))
				redirty_tail_locked(inode, wb);
			else if (inode->i_state & I_DIRTY_TIME) {
				inode->dirtied_when = jiffies;
				inode_io_list_move_locked(inode,
							  wb,
							  &wb->b_dirty_time);
			}
		}
	}

	spin_unlock(&wb->list_lock);
	inode_sync_complete(inode);
out:
	spin_unlock(&inode->i_lock);
	return ret;
}

static long writeback_chunk_size(struct bdi_writeback *wb,
				 struct wb_writeback_work *work)
{
	long pages;

	/*
	 * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
	 * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
	 * here avoids calling into writeback_inodes_wb() more than once.
	 *
	 * The intended call sequence for WB_SYNC_ALL writeback is:
	 *
	 *      wb_writeback()
	 *          writeback_sb_inodes()       <== called only once
	 *              write_cache_pages()     <== called once for each inode
	 *                   (quickly) tag currently dirty pages
	 *                   (maybe slowly) sync all tagged pages
	 */
	if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
		pages = LONG_MAX;
	else {
		pages = min(wb->avg_write_bandwidth / 2,
			    global_wb_domain.dirty_limit / DIRTY_SCOPE);
		pages = min(pages, work->nr_pages);
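		/*
		 * Round up to a whole multiple of MIN_WRITEBACK_PAGES (and
		 * never below one 4MB chunk), erring toward writing a bit
		 * more per slice rather than less.
		 */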
		pages = round_down(pages + MIN_WRITEBACK_PAGES,
				   MIN_WRITEBACK_PAGES);
	}

	return pages;
}

/*
 * Write a portion of b_io inodes which belong to @sb.
 *
 * Return the number of pages and/or inodes written.
 *
 * NOTE! This is called with wb->list_lock held, and will
 * unlock and relock that for each inode it ends up doing
 * IO for.
 */
static long writeback_sb_inodes(struct super_block *sb,
				struct bdi_writeback *wb,
				struct wb_writeback_work *work)
{
	struct writeback_control wbc = {
		.sync_mode		= work->sync_mode,
		.tagged_writepages	= work->tagged_writepages,
		.for_kupdate		= work->for_kupdate,
		.for_background		= work->for_background,
		.for_sync		= work->for_sync,
		.range_cyclic		= work->range_cyclic,
		.range_start		= 0,
		.range_end		= LLONG_MAX,
	};
	unsigned long start_time = jiffies;
	long write_chunk;
	long total_wrote = 0;  /* count both pages and inodes */

	while (!list_empty(&wb->b_io)) {
		struct inode *inode = wb_inode(wb->b_io.prev);
		struct bdi_writeback *tmp_wb;
		long wrote;

		if (inode->i_sb != sb) {
			if (work->sb) {
				/*
				 * We only want to write back data for this
				 * superblock, move all inodes not belonging
				 * to it back onto the dirty list.
				 */
				redirty_tail(inode, wb);
				continue;
			}

			/*
			 * The inode belongs to a different superblock.
			 * Bounce back to the caller to unpin this and
			 * pin the next superblock.
			 */
			break;
		}

		/*
		 * Don't bother with new inodes or inodes being freed; the
		 * first kind does not need periodic writeout yet, and for
		 * the latter kind writeout is handled by the freer.
		 */
		spin_lock(&inode->i_lock);
		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
			redirty_tail_locked(inode, wb);
			spin_unlock(&inode->i_lock);
			continue;
		}
		if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
			/*
			 * If this inode is locked for writeback and we are not
			 * doing writeback-for-data-integrity, move it to
			 * b_more_io so that writeback can proceed with the
			 * other inodes on b_io.
			 *
			 * We'll have another go at writing back this inode
			 * when we have completed a full scan of b_io.
			 */
			requeue_io(inode, wb);
			spin_unlock(&inode->i_lock);
			trace_writeback_sb_inodes_requeue(inode);
			continue;
		}
		spin_unlock(&wb->list_lock);

		/*
		 * We already requeued the inode if it had I_SYNC set and we
		 * are doing WB_SYNC_NONE writeback. So this catches only the
		 * WB_SYNC_ALL case.
		 */
		if (inode->i_state & I_SYNC) {
			/* Wait for I_SYNC. This function drops i_lock... */
			inode_sleep_on_writeback(inode);
			/* Inode may be gone, start again */
			spin_lock(&wb->list_lock);
			continue;
		}
		inode->i_state |= I_SYNC;
		wbc_attach_and_unlock_inode(&wbc, inode);

		write_chunk = writeback_chunk_size(wb, work);
		wbc.nr_to_write = write_chunk;
		wbc.pages_skipped = 0;

		/*
		 * We use I_SYNC to pin the inode in memory. While it is set,
		 * evict_inode() will wait so the inode cannot be freed.
		 */
		__writeback_single_inode(inode, &wbc);

		wbc_detach_inode(&wbc);
		work->nr_pages -= write_chunk - wbc.nr_to_write;
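		/*
		 * Progress is what was actually written; skipped pages don't
		 * count, and the clamp below guards against pages_skipped
		 * exceeding the pages consumed from the chunk.
		 */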
		wrote = write_chunk - wbc.nr_to_write - wbc.pages_skipped;
		wrote = wrote < 0 ? 0 : wrote;
		total_wrote += wrote;

		if (need_resched()) {
			/*
			 * We're trying to balance between building up a nice
			 * long list of IOs to improve our merge rate, and
			 * getting those IOs out quickly for anyone throttling
			 * in balance_dirty_pages().  cond_resched() doesn't
			 * unplug, so get our IOs out the door before we
			 * give up the CPU.
			 */
			blk_flush_plug(current->plug, false);
			cond_resched();
		}

		/*
		 * Requeue @inode if still dirty.  Be careful as @inode may
		 * have been switched to another wb in the meantime.
		 */
		tmp_wb = inode_to_wb_and_lock_list(inode);
		spin_lock(&inode->i_lock);
		if (!(inode->i_state & I_DIRTY_ALL))
			total_wrote++;
		requeue_inode(inode, tmp_wb, &wbc);
		inode_sync_complete(inode);
		spin_unlock(&inode->i_lock);

		if (unlikely(tmp_wb != wb)) {
			spin_unlock(&tmp_wb->list_lock);
			spin_lock(&wb->list_lock);
		}

		/*
		 * Bail out to wb_writeback() often enough to check the
		 * background threshold and other termination conditions.
		 */
		if (total_wrote) {
			if (time_is_before_jiffies(start_time + HZ / 10UL))
				break;
			if (work->nr_pages <= 0)
				break;
		}
	}
	return total_wrote;
}

static long __writeback_inodes_wb(struct bdi_writeback *wb,
				  struct wb_writeback_work *work)
{
	unsigned long start_time = jiffies;
	long wrote = 0;

	while (!list_empty(&wb->b_io)) {
		struct inode *inode = wb_inode(wb->b_io.prev);
		struct super_block *sb = inode->i_sb;

		if (!super_trylock_shared(sb)) {
			/*
			 * super_trylock_shared() may fail consistently due to
			 * s_umount being grabbed by someone else. Don't use
			 * requeue_io() to avoid busy retrying the inode/sb.
			 */
			redirty_tail(inode, wb);
			continue;
		}
		wrote += writeback_sb_inodes(sb, wb, work);
		up_read(&sb->s_umount);

		/* refer to the same tests at the end of writeback_sb_inodes */
		if (wrote) {
			if (time_is_before_jiffies(start_time + HZ / 10UL))
				break;
			if (work->nr_pages <= 0)
				break;
		}
	}
	/* Leave any unwritten inodes on b_io */
	return wrote;
}

static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
				enum wb_reason reason)
{
	struct wb_writeback_work work = {
		.nr_pages	= nr_pages,
		.sync_mode	= WB_SYNC_NONE,
		.range_cyclic	= 1,
		.reason		= reason,
	};
	struct blk_plug plug;

	blk_start_plug(&plug);
	spin_lock(&wb->list_lock);
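	/*
	 * If b_io is empty, refill it; passing jiffies as dirtied_before
	 * expires every inode dirtied before "now".
	 */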
	if (list_empty(&wb->b_io))
		queue_io(wb, &work, jiffies);
	__writeback_inodes_wb(wb, &work);
	spin_unlock(&wb->list_lock);
	blk_finish_plug(&plug);

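	/*
	 * work.nr_pages was decremented as pages were written, so the
	 * difference is the number of pages actually written back.
	 */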
202162306a36Sopenharmony_ci	return nr_pages - work.nr_pages;
202262306a36Sopenharmony_ci}
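
/*
 * Note that writeback_inodes_wb() brackets the whole pass with a block plug
 * so the many small bios generated while walking b_io can be merged and
 * submitted in batches.  A minimal sketch of that pattern (illustrative
 * only, not code used by this file):
 *
 *	struct blk_plug plug;
 *
 *	blk_start_plug(&plug);
 *	// queue a batch of bios here, e.g. via ->writepages
 *	blk_finish_plug(&plug);	// flush the plugged requests to the device
 */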

/*
 * Explicit flushing or periodic writeback of "old" data.
 *
 * Define "old": the first time one of an inode's pages is dirtied, we mark the
 * dirtying-time in the inode's address_space.  So this periodic writeback code
 * just walks the superblock inode list, writing back any inodes which are
 * older than a specific point in time.
 *
 * Try to run once per dirty_writeback_interval.  But if a writeback event
 * takes longer than one dirty_writeback_interval, then leave a one-second gap.
 *
 * dirtied_before takes precedence over nr_to_write.  So we'll only write back
 * all dirty pages if they are all attached to "old" mappings.
 */
static long wb_writeback(struct bdi_writeback *wb,
			 struct wb_writeback_work *work)
{
	long nr_pages = work->nr_pages;
	unsigned long dirtied_before = jiffies;
	struct inode *inode;
	long progress;
	struct blk_plug plug;

	blk_start_plug(&plug);
	for (;;) {
		/*
		 * Stop writeback when nr_pages has been consumed
		 */
		if (work->nr_pages <= 0)
			break;
		/*
		 * Background writeout and kupdate-style writeback may
		 * run forever. Stop them if there is other work to do
		 * so that e.g. sync can proceed. They'll be restarted
		 * after the other work items are all done.
		 */
		if ((work->for_background || work->for_kupdate) &&
		    !list_empty(&wb->work_list))
			break;

		/*
		 * For background writeout, stop when we are below the
		 * background dirty threshold
		 */
		if (work->for_background && !wb_over_bg_thresh(wb))
			break;

		spin_lock(&wb->list_lock);

		/*
		 * Kupdate and background works are special and we want to
		 * include all inodes that need writing. Livelock avoidance is
		 * handled by these works yielding to any other work so we are
		 * safe.
		 */
		if (work->for_kupdate) {
			dirtied_before = jiffies -
				msecs_to_jiffies(dirty_expire_interval * 10);
		} else if (work->for_background)
			dirtied_before = jiffies;

		trace_writeback_start(wb, work);
		if (list_empty(&wb->b_io))
			queue_io(wb, work, dirtied_before);
		if (work->sb)
			progress = writeback_sb_inodes(work->sb, wb, work);
		else
			progress = __writeback_inodes_wb(wb, work);
		trace_writeback_written(wb, work);

		/*
		 * Did we write something? Try for more
		 *
		 * Dirty inodes are moved to b_io for writeback in batches.
		 * The completion of the current batch does not necessarily
		 * mean the overall work is done. So we keep looping as long
		 * as we made some progress on cleaning pages or inodes.
		 */
		if (progress) {
			spin_unlock(&wb->list_lock);
			continue;
		}

		/*
		 * No more inodes for IO, bail
		 */
		if (list_empty(&wb->b_more_io)) {
			spin_unlock(&wb->list_lock);
			break;
		}

		/*
		 * Nothing written. Wait for some inode to
		 * become available for writeback. Otherwise
		 * we'll just busyloop.
		 */
		trace_writeback_wait(wb, work);
		inode = wb_inode(wb->b_more_io.prev);
		spin_lock(&inode->i_lock);
		spin_unlock(&wb->list_lock);
		/* This function drops i_lock... */
		inode_sleep_on_writeback(inode);
	}
	blk_finish_plug(&plug);

	return nr_pages - work->nr_pages;
}

/*
 * Return the next wb_writeback_work struct that hasn't been processed yet.
 */
static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb)
{
	struct wb_writeback_work *work = NULL;

	spin_lock_irq(&wb->work_lock);
	if (!list_empty(&wb->work_list)) {
		work = list_entry(wb->work_list.next,
				  struct wb_writeback_work, list);
		list_del_init(&work->list);
	}
	spin_unlock_irq(&wb->work_lock);
	return work;
}
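
/*
 * For context: the producer side of this list pairs the same work_lock with
 * a wakeup of the per-wb delayed work.  A minimal sketch of such a producer,
 * assuming the bdi_wq workqueue used by the wb_queue_work() helper earlier
 * in this file (illustrative only; the real helper also handles completions
 * and the WB_registered check):
 *
 *	spin_lock_irq(&wb->work_lock);
 *	list_add_tail(&work->list, &wb->work_list);
 *	mod_delayed_work(bdi_wq, &wb->dwork, 0);	// kick wb_workfn() now
 *	spin_unlock_irq(&wb->work_lock);
 */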

static long wb_check_background_flush(struct bdi_writeback *wb)
{
	if (wb_over_bg_thresh(wb)) {

		struct wb_writeback_work work = {
			.nr_pages	= LONG_MAX,
			.sync_mode	= WB_SYNC_NONE,
			.for_background	= 1,
			.range_cyclic	= 1,
			.reason		= WB_REASON_BACKGROUND,
		};

		return wb_writeback(wb, &work);
	}

	return 0;
}
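
/*
 * For scale: with the default vm.dirty_background_ratio of 10, background
 * writeback starts once roughly 10% of dirtyable memory is dirty; the per-wb
 * share of that threshold is what wb_over_bg_thresh() checks.  Hypothetical
 * numbers: on a machine with ~16 GiB of dirtyable memory, this work item
 * would be queued once around ~1.6 GiB of pagecache is dirty, and it keeps
 * writing (nr_pages is LONG_MAX) until wb_writeback() sees the threshold is
 * no longer exceeded.
 */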

static long wb_check_old_data_flush(struct bdi_writeback *wb)
{
	unsigned long expired;
	long nr_pages;

	/*
	 * When set to zero, disable periodic writeback
	 */
	if (!dirty_writeback_interval)
		return 0;

	expired = wb->last_old_flush +
			msecs_to_jiffies(dirty_writeback_interval * 10);
	if (time_before(jiffies, expired))
		return 0;

	wb->last_old_flush = jiffies;
	nr_pages = get_nr_dirty_pages();

	if (nr_pages) {
		struct wb_writeback_work work = {
			.nr_pages	= nr_pages,
			.sync_mode	= WB_SYNC_NONE,
			.for_kupdate	= 1,
			.range_cyclic	= 1,
			.reason		= WB_REASON_PERIODIC,
		};

		return wb_writeback(wb, &work);
	}

	return 0;
}
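
/*
 * Unit check for the "* 10" above: dirty_writeback_interval is the
 * vm.dirty_writeback_centisecs sysctl, i.e. hundredths of a second, so
 * multiplying by 10 converts centiseconds to the milliseconds that
 * msecs_to_jiffies() expects.  With the default value of 500:
 *
 *	msecs_to_jiffies(500 * 10)	== 5000 ms == one periodic flush per 5 s
 */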

static long wb_check_start_all(struct bdi_writeback *wb)
{
	long nr_pages;

	if (!test_bit(WB_start_all, &wb->state))
		return 0;

	nr_pages = get_nr_dirty_pages();
	if (nr_pages) {
		struct wb_writeback_work work = {
			.nr_pages	= wb_split_bdi_pages(wb, nr_pages),
			.sync_mode	= WB_SYNC_NONE,
			.range_cyclic	= 1,
			.reason		= wb->start_all_reason,
		};

		nr_pages = wb_writeback(wb, &work);
	}

	clear_bit(WB_start_all, &wb->state);
	return nr_pages;
}


/*
 * Retrieve work items and do the writeback they describe
 */
static long wb_do_writeback(struct bdi_writeback *wb)
{
	struct wb_writeback_work *work;
	long wrote = 0;

	set_bit(WB_writeback_running, &wb->state);
	while ((work = get_next_work_item(wb)) != NULL) {
		trace_writeback_exec(wb, work);
		wrote += wb_writeback(wb, work);
		finish_writeback_work(wb, work);
	}

	/*
	 * Check for a flush-everything request
	 */
	wrote += wb_check_start_all(wb);

	/*
	 * Check for periodic writeback, kupdated() style
	 */
	wrote += wb_check_old_data_flush(wb);
	wrote += wb_check_background_flush(wb);
	clear_bit(WB_writeback_running, &wb->state);

	return wrote;
}

/*
 * Handle writeback of dirty data for the device backed by this bdi. Also
 * reschedules periodically and does kupdated style flushing.
 */
void wb_workfn(struct work_struct *work)
{
	struct bdi_writeback *wb = container_of(to_delayed_work(work),
						struct bdi_writeback, dwork);
	long pages_written;

	set_worker_desc("flush-%s", bdi_dev_name(wb->bdi));

	if (likely(!current_is_workqueue_rescuer() ||
		   !test_bit(WB_registered, &wb->state))) {
		/*
		 * The normal path.  Keep writing back @wb until its
		 * work_list is empty.  Note that this path is also taken
		 * if @wb is shutting down even when we're running off the
		 * rescuer as work_list needs to be drained.
		 */
		do {
			pages_written = wb_do_writeback(wb);
			trace_writeback_pages_written(pages_written);
		} while (!list_empty(&wb->work_list));
	} else {
		/*
		 * bdi_wq can't get enough workers and we're running off
		 * the emergency worker.  Don't hog it.  Hopefully, 1024 is
		 * enough for efficient IO.
		 */
		pages_written = writeback_inodes_wb(wb, 1024,
						    WB_REASON_FORKER_THREAD);
		trace_writeback_pages_written(pages_written);
	}

	if (!list_empty(&wb->work_list))
		wb_wakeup(wb);
	else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
		wb_wakeup_delayed(wb);
}

/*
 * Start writeback of all currently dirty pages on this bdi by waking each
 * of its writeback contexts.
 */
static void __wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
					 enum wb_reason reason)
{
	struct bdi_writeback *wb;

	if (!bdi_has_dirty_io(bdi))
		return;

	list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
		wb_start_writeback(wb, reason);
}

void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
				enum wb_reason reason)
{
	rcu_read_lock();
	__wakeup_flusher_threads_bdi(bdi, reason);
	rcu_read_unlock();
}

/*
 * Wake up the flusher threads to start writeback of all currently dirty pages
 */
void wakeup_flusher_threads(enum wb_reason reason)
{
	struct backing_dev_info *bdi;

	/*
	 * If we are expecting writeback progress we must submit plugged IO.
	 */
	blk_flush_plug(current->plug, true);

	rcu_read_lock();
	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
		__wakeup_flusher_threads_bdi(bdi, reason);
	rcu_read_unlock();
}
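
/*
 * A typical caller is the sync(2) path, which kicks all flusher threads
 * asynchronously before waiting on each superblock.  Roughly (a simplified
 * sketch of fs/sync.c; details vary between kernel versions):
 *
 *	wakeup_flusher_threads(WB_REASON_SYNC);
 *	iterate_supers(sync_inodes_one_sb, NULL);
 *	iterate_supers(sync_fs_one_sb, &nowait);
 *	iterate_supers(sync_fs_one_sb, &wait);
 */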

/*
 * Wake up bdi's periodically to make sure dirtytime inodes get
 * written back periodically.  We deliberately do *not* check the
 * b_dirtytime list in wb_has_dirty_io(), since this would cause the
 * kernel to be constantly waking up once there are any dirtytime
 * inodes on the system.  So instead we define a separate delayed work
 * function which gets called much more rarely.  (By default, only
 * once every 12 hours.)
 *
 * If there is any other write activity going on in the file system,
 * this function won't be necessary.  But if the only thing that has
 * happened on the file system is a dirtytime inode caused by an atime
 * update, we need the infrastructure below to make sure that inode
 * eventually gets pushed out to disk.
 */
static void wakeup_dirtytime_writeback(struct work_struct *w);
static DECLARE_DELAYED_WORK(dirtytime_work, wakeup_dirtytime_writeback);

static void wakeup_dirtytime_writeback(struct work_struct *w)
{
	struct backing_dev_info *bdi;

	rcu_read_lock();
	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
		struct bdi_writeback *wb;

		list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
			if (!list_empty(&wb->b_dirty_time))
				wb_wakeup(wb);
	}
	rcu_read_unlock();
	schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
}

static int __init start_dirtytime_writeback(void)
{
	schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
	return 0;
}
__initcall(start_dirtytime_writeback);

int dirtytime_interval_handler(struct ctl_table *table, int write,
			       void *buffer, size_t *lenp, loff_t *ppos)
{
	int ret;

	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	if (ret == 0 && write)
		mod_delayed_work(system_wq, &dirtytime_work, 0);
	return ret;
}
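
/*
 * This handler is wired up to the vm.dirtytime_expire_seconds sysctl.  A
 * sketch of the corresponding ctl_table entry (the real one lives in the
 * sysctl tables; field details may differ between kernel versions):
 *
 *	{
 *		.procname	= "dirtytime_expire_seconds",
 *		.data		= &dirtytime_expire_interval,
 *		.maxlen		= sizeof(dirtytime_expire_interval),
 *		.mode		= 0644,
 *		.proc_handler	= dirtytime_interval_handler,
 *		.extra1		= SYSCTL_ZERO,
 *	},
 */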

/**
 * __mark_inode_dirty -	internal function to mark an inode dirty
 *
 * @inode: inode to mark
 * @flags: what kind of dirty, e.g. I_DIRTY_SYNC.  This can be a combination of
 *	   multiple I_DIRTY_* flags, except that I_DIRTY_TIME can't be combined
 *	   with I_DIRTY_PAGES.
 *
 * Mark an inode as dirty.  We notify the filesystem, then update the inode's
 * dirty flags.  Then, if needed, we add the inode to the appropriate dirty
 * list.
 *
 * Most callers should use mark_inode_dirty() or mark_inode_dirty_sync()
 * instead of calling this directly.
 *
 * CAREFUL!  We only add the inode to the dirty list if it is hashed or if it
 * refers to a blockdev.  Unhashed inodes will never be added to the dirty list
 * even if they are later hashed, as they will have been marked dirty already.
 *
 * In short, ensure you hash any inodes _before_ you start marking them dirty.
 *
 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
 * the block-special inode (/dev/hda1) itself.  And the ->dirtied_when field of
 * the kernel-internal blockdev inode represents the dirtying time of the
 * blockdev's pages.  This is why for I_DIRTY_PAGES we always use
 * page->mapping->host, so the page-dirtying time is recorded in the internal
 * blockdev inode.
 */
void __mark_inode_dirty(struct inode *inode, int flags)
{
	struct super_block *sb = inode->i_sb;
	int dirtytime = 0;
	struct bdi_writeback *wb = NULL;

	trace_writeback_mark_inode_dirty(inode, flags);

	if (flags & I_DIRTY_INODE) {
		/*
		 * Inode timestamp updates will piggyback on this dirtying.
		 * We tell the ->dirty_inode callback that timestamps need to
		 * be updated by setting I_DIRTY_TIME in flags.
		 */
		if (inode->i_state & I_DIRTY_TIME) {
			spin_lock(&inode->i_lock);
			if (inode->i_state & I_DIRTY_TIME) {
				inode->i_state &= ~I_DIRTY_TIME;
				flags |= I_DIRTY_TIME;
			}
			spin_unlock(&inode->i_lock);
		}

		/*
		 * Notify the filesystem about the inode being dirtied, so that
		 * (if needed) it can update on-disk fields and journal the
		 * inode.  This is only needed when the inode itself is being
		 * dirtied now.  I.e. it's only needed for I_DIRTY_INODE, not
		 * for just I_DIRTY_PAGES or I_DIRTY_TIME.
		 */
		trace_writeback_dirty_inode_start(inode, flags);
		if (sb->s_op->dirty_inode)
			sb->s_op->dirty_inode(inode,
				flags & (I_DIRTY_INODE | I_DIRTY_TIME));
		trace_writeback_dirty_inode(inode, flags);

		/* I_DIRTY_INODE supersedes I_DIRTY_TIME. */
		flags &= ~I_DIRTY_TIME;
	} else {
		/*
		 * Else it's either I_DIRTY_PAGES, I_DIRTY_TIME, or nothing.
		 * (We don't support setting both I_DIRTY_PAGES and I_DIRTY_TIME
		 * in one call to __mark_inode_dirty().)
		 */
		dirtytime = flags & I_DIRTY_TIME;
		WARN_ON_ONCE(dirtytime && flags != I_DIRTY_TIME);
	}

	/*
	 * Paired with smp_mb() in __writeback_single_inode() for the
	 * following lockless i_state test.  See there for details.
	 */
	smp_mb();

	if ((inode->i_state & flags) == flags)
		return;

	spin_lock(&inode->i_lock);
	if ((inode->i_state & flags) != flags) {
		const int was_dirty = inode->i_state & I_DIRTY;

		inode_attach_wb(inode, NULL);

		inode->i_state |= flags;

		/*
		 * Grab the inode's wb early because doing so requires
		 * dropping i_lock, and we need to make sure the following
		 * checks happen atomically with the dirty-list handling so
		 * that we don't move inodes out from under the flush
		 * worker's hands.
		 */
		if (!was_dirty) {
			wb = locked_inode_to_wb_and_lock_list(inode);
			spin_lock(&inode->i_lock);
		}

		/*
		 * If the inode is queued for writeback by flush worker, just
		 * update its dirty state. Once the flush worker is done with
		 * the inode it will place it on the appropriate superblock
		 * list, based upon its state.
		 */
		if (inode->i_state & I_SYNC_QUEUED)
			goto out_unlock;

		/*
		 * Only add valid (hashed) inodes to the superblock's
		 * dirty list.  Add blockdev inodes as well.
		 */
		if (!S_ISBLK(inode->i_mode)) {
			if (inode_unhashed(inode))
				goto out_unlock;
		}
		if (inode->i_state & I_FREEING)
			goto out_unlock;

		/*
		 * If the inode was already on b_dirty/b_io/b_more_io, don't
		 * reposition it (that would break b_dirty time-ordering).
		 */
		if (!was_dirty) {
			struct list_head *dirty_list;
			bool wakeup_bdi = false;

			inode->dirtied_when = jiffies;
			if (dirtytime)
				inode->dirtied_time_when = jiffies;

			if (inode->i_state & I_DIRTY)
				dirty_list = &wb->b_dirty;
			else
				dirty_list = &wb->b_dirty_time;

			wakeup_bdi = inode_io_list_move_locked(inode, wb,
							       dirty_list);

			spin_unlock(&wb->list_lock);
			spin_unlock(&inode->i_lock);
			trace_writeback_dirty_inode_enqueue(inode);

			/*
			 * If this is the first dirty inode for this bdi,
			 * we have to wake up the corresponding bdi thread
			 * to make sure background write-back happens
			 * later.
			 */
			if (wakeup_bdi &&
			    (wb->bdi->capabilities & BDI_CAP_WRITEBACK))
				wb_wakeup_delayed(wb);
			return;
		}
	}
out_unlock:
	if (wb)
		spin_unlock(&wb->list_lock);
	spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(__mark_inode_dirty);
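
/*
 * Typical filesystem code goes through the mark_inode_dirty() wrappers in
 * <linux/fs.h> rather than calling the function above directly.  A sketch
 * (illustrative; new_size is a hypothetical local):
 *
 *	inode->i_size = new_size;
 *	mark_inode_dirty(inode);	// __mark_inode_dirty(inode, I_DIRTY)
 *
 * Pure timestamp updates use the lighter I_DIRTY_TIME flavour instead, so
 * the inode can sit on b_dirty_time rather than the regular dirty lists.
 */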

/*
 * The @s_sync_lock is used to serialise concurrent sync operations
 * to avoid lock contention problems with concurrent wait_sb_inodes() calls.
 * Concurrent callers will block on the s_sync_lock rather than doing
 * contending walks.  The queueing maintains the behaviour required by
 * sync(2): all the IO that has been issued up to the time this function is
 * entered is guaranteed to be completed by the time we have gained the lock
 * and waited for all IO that is in progress, regardless of the order in
 * which callers are granted the lock.
 */
static void wait_sb_inodes(struct super_block *sb)
{
	LIST_HEAD(sync_list);

	/*
	 * We need to be protected against the filesystem going from
	 * r/o to r/w or vice versa.
	 */
	WARN_ON(!rwsem_is_locked(&sb->s_umount));

	mutex_lock(&sb->s_sync_lock);

	/*
	 * Splice the writeback list onto a temporary list to avoid waiting on
	 * inodes that have started writeback after this point.
	 *
	 * Use rcu_read_lock() to keep the inodes around until we have a
	 * reference. s_inode_wblist_lock protects sb->s_inodes_wb as well as
	 * the local list because inodes can be dropped from either by writeback
	 * completion.
	 */
	rcu_read_lock();
	spin_lock_irq(&sb->s_inode_wblist_lock);
	list_splice_init(&sb->s_inodes_wb, &sync_list);

	/*
	 * Data integrity sync: we must wait for all pages under writeback,
	 * because there may be pages that were dirtied before our sync call
	 * but had writeout started before we got to them.  In that case the
	 * inode may no longer be on the dirty list, but we still have to
	 * wait for that writeout.
	 */
	while (!list_empty(&sync_list)) {
		struct inode *inode = list_first_entry(&sync_list, struct inode,
						       i_wb_list);
		struct address_space *mapping = inode->i_mapping;

		/*
		 * Move each inode back to the wb list before we drop the lock
		 * to preserve consistency between i_wb_list and the mapping
		 * writeback tag. Writeback completion is responsible for
		 * removing the inode from either list once the writeback tag
		 * is cleared.
		 */
		list_move_tail(&inode->i_wb_list, &sb->s_inodes_wb);

		/*
		 * The mapping can appear untagged while still on-list since we
		 * do not have the mapping lock. Skip it here, wb completion
		 * will remove it.
		 */
		if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
			continue;

		spin_unlock_irq(&sb->s_inode_wblist_lock);

		spin_lock(&inode->i_lock);
		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
			spin_unlock(&inode->i_lock);

			spin_lock_irq(&sb->s_inode_wblist_lock);
			continue;
		}
		__iget(inode);
		spin_unlock(&inode->i_lock);
		rcu_read_unlock();

		/*
		 * We keep the error status of each individual mapping so that
		 * applications can catch the writeback error using fsync(2).
		 * See filemap_fdatawait_keep_errors() for details.
		 */
		filemap_fdatawait_keep_errors(mapping);

		cond_resched();

		iput(inode);

		rcu_read_lock();
		spin_lock_irq(&sb->s_inode_wblist_lock);
	}
	spin_unlock_irq(&sb->s_inode_wblist_lock);
	rcu_read_unlock();
	mutex_unlock(&sb->s_sync_lock);
}

static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
				     enum wb_reason reason, bool skip_if_busy)
{
	struct backing_dev_info *bdi = sb->s_bdi;
	DEFINE_WB_COMPLETION(done, bdi);
	struct wb_writeback_work work = {
		.sb			= sb,
		.sync_mode		= WB_SYNC_NONE,
		.tagged_writepages	= 1,
		.done			= &done,
		.nr_pages		= nr,
		.reason			= reason,
	};

	if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info)
		return;
	WARN_ON(!rwsem_is_locked(&sb->s_umount));

	bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy);
	wb_wait_for_completion(&done);
}

/**
 * writeback_inodes_sb_nr -	writeback dirty inodes from given super_block
 * @sb: the superblock
 * @nr: the number of pages to write
 * @reason: reason why some writeback work was initiated
 *
 * Start writeback on some inodes on this super_block. No guarantees are made
 * on how many (if any) will be written, and this function does not wait
 * for IO completion of submitted IO.
 */
void writeback_inodes_sb_nr(struct super_block *sb,
			    unsigned long nr,
			    enum wb_reason reason)
{
	__writeback_inodes_sb_nr(sb, nr, reason, false);
}
EXPORT_SYMBOL(writeback_inodes_sb_nr);
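
/*
 * Callers must hold sb->s_umount shared (see the WARN_ON in the helper
 * above).  A sketch of a filesystem nudging out a bounded amount of dirty
 * data, e.g. when short on free space (illustrative only):
 *
 *	down_read(&sb->s_umount);
 *	writeback_inodes_sb_nr(sb, 1024, WB_REASON_FS_FREE_SPACE);
 *	up_read(&sb->s_umount);
 */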

/**
 * writeback_inodes_sb	-	writeback dirty inodes from given super_block
 * @sb: the superblock
 * @reason: reason why some writeback work was initiated
 *
 * Start writeback on some inodes on this super_block. No guarantees are made
 * on how many (if any) will be written, and this function does not wait
 * for IO completion of submitted IO.
 */
void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
{
	return writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
}
EXPORT_SYMBOL(writeback_inodes_sb);

/**
 * try_to_writeback_inodes_sb - try to start writeback if none underway
 * @sb: the superblock
 * @reason: reason why some writeback work was initiated
 *
 * Invoke __writeback_inodes_sb_nr if no writeback is currently underway.
 */
void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
{
	if (!down_read_trylock(&sb->s_umount))
		return;

	__writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason, true);
	up_read(&sb->s_umount);
}
EXPORT_SYMBOL(try_to_writeback_inodes_sb);

/**
 * sync_inodes_sb	-	sync sb inode pages
 * @sb: the superblock
 *
 * This function writes and waits on any dirty inode belonging to this
 * super_block.
 */
void sync_inodes_sb(struct super_block *sb)
{
	struct backing_dev_info *bdi = sb->s_bdi;
	DEFINE_WB_COMPLETION(done, bdi);
	struct wb_writeback_work work = {
		.sb		= sb,
		.sync_mode	= WB_SYNC_ALL,
		.nr_pages	= LONG_MAX,
		.range_cyclic	= 0,
		.done		= &done,
		.reason		= WB_REASON_SYNC,
		.for_sync	= 1,
	};

	/*
	 * Can't skip based on !bdi_has_dirty_io(): inodes that are under
	 * writeback but no longer dirty still need waiting on, and
	 * I_DIRTY_TIME inodes, which bdi_has_dirty_io() ignores, need to
	 * be written out too.
	 */
	if (bdi == &noop_backing_dev_info)
		return;
	WARN_ON(!rwsem_is_locked(&sb->s_umount));

	/* protect against inode wb switch, see inode_switch_wbs_work_fn() */
	bdi_down_write_wb_switch_rwsem(bdi);
	bdi_split_work_to_wbs(bdi, &work, false);
	wb_wait_for_completion(&done);
	bdi_up_write_wb_switch_rwsem(bdi);

	wait_sb_inodes(sb);
}
EXPORT_SYMBOL(sync_inodes_sb);
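
/*
 * syncfs(2) and unmount reach this function via sync_filesystem().  Roughly
 * (a simplified sketch of fs/sync.c; ordering and error handling vary
 * between kernel versions):
 *
 *	writeback_inodes_sb(sb, WB_REASON_SYNC);	// async pass first
 *	if (sb->s_op->sync_fs)
 *		sb->s_op->sync_fs(sb, 0);
 *	sync_inodes_sb(sb);				// write and wait
 *	if (sb->s_op->sync_fs)
 *		sb->s_op->sync_fs(sb, 1);
 *	return sync_blockdev(sb->s_bdev);
 */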

/**
 * write_inode_now	-	write an inode to disk
 * @inode: inode to write to disk
 * @sync: whether the write should be synchronous or not
 *
 * This function commits an inode to disk immediately if it is dirty. This is
 * primarily needed by knfsd.
 *
 * The caller must either have a ref on the inode or must have set I_WILL_FREE.
 */
int write_inode_now(struct inode *inode, int sync)
{
	struct writeback_control wbc = {
		.nr_to_write = LONG_MAX,
		.sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
		.range_start = 0,
		.range_end = LLONG_MAX,
	};

	if (!mapping_can_writeback(inode->i_mapping))
		wbc.nr_to_write = 0;

	might_sleep();
	return writeback_single_inode(inode, &wbc);
}
EXPORT_SYMBOL(write_inode_now);

/**
 * sync_inode_metadata - write an inode to disk
 * @inode: the inode to sync
 * @wait: wait for I/O to complete.
 *
 * Write an inode to disk and adjust its dirty state after completion.
 *
 * Note: only writes the actual inode, no associated data or other metadata.
 */
int sync_inode_metadata(struct inode *inode, int wait)
{
	struct writeback_control wbc = {
		.sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
		.nr_to_write = 0, /* metadata-only */
	};

	return writeback_single_inode(inode, &wbc);
}
EXPORT_SYMBOL(sync_inode_metadata);
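
/*
 * A common user is the generic fsync helper, which flushes associated
 * buffers first and then syncs just the inode.  A sketch (illustrative,
 * modelled loosely on __generic_file_fsync()):
 *
 *	ret = sync_mapping_buffers(inode->i_mapping);
 *	if (!(inode->i_state & I_DIRTY_ALL))
 *		return ret;
 *	err = sync_inode_metadata(inode, 1);	// wait for the inode write
 *	if (ret == 0)
 *		ret = err;
 */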